In [1]:
import xml.etree.ElementTree as ET 
import pprint 
from collections import defaultdict 
import re

In [2]:
infile="houston_texas.osm"

In [3]:
def count_tags(filename): 
    '''Iterate through each element in the file and add the relevant node name to a dictionary 
    the first time with a value of 1 and then increment by 1 each time that node appears again.''' 
    #initialize defaultdict to avoid KeyError and allow new keys not found in dictionary yet 
    tags = defaultdict(int) 
    #iterate through each node element and increment the dictionary value for that node.tag key 
    for event, node in ET.iterparse(filename): 
        if event == 'end':  
            tags[node.tag]+=1 
        # discard the element is needed to clear from memory and speed up processing 
        node.clear()              
    return tags 
 
 
tags = count_tags('Houston_texas.osm') 
pprint.pprint(tags) 


defaultdict(<type 'int'>, {'node': 2865568, 'nd': 3410766, 'bounds': 1, 'member': 27143, 'tag': 2059263, 'relation': 2362, 'way': 344212, 'osm': 1})


In [4]:

#create the three regular expressions we are checking for 
lower = re.compile(r'^([a-z]|_)*$') 
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') 
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') 

def key_type(element, keys): 
    if element.tag == "tag": 
        if re.search(lower, element.attrib['k']):         
            keys['lower'] += 1             
        elif re.search(lower_colon, element.attrib['k']): 
            keys['lower_colon'] += 1             
        elif re.search(problemchars, element.attrib['k']): 
            keys['problemchars'] += 1 
            #print out any values with problematic characters 
            #print element             
            print element.attrib['k']             
        else: 
            keys['other'] += 1                       
    return keys 
 
def process_map(filename): 
    #initialize dictionary 
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0} 
    for _, element in ET.iterparse(filename): 
        keys = key_type(element, keys) 
        # discard the element is needed to clear from memory and speed up processing 
        element.clear() 
    return keys  
 
keys = process_map('Houston_texas.osm') 
pprint.pprint(keys) 


service area
service area
{'lower': 839457, 'lower_colon': 1164223, 'other': 55581, 'problemchars': 2}


In [5]:
def process_map(filename): 
    users = set() 
    for _, element in ET.iterparse(filename): 
        try: 
            users.add(element.attrib['uid']) 
        except KeyError: 
            pass 
        element.clear() #to clear memory 
    return users 
 
users = process_map('Houston_texas.osm') 
print len(users) 

1293


In [5]:
OSMFILE = "Houston_texas.osm" 
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE) 

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",  
             "Trail", "Parkway", "Commons"] 

# UPDATE THIS VARIABLE            
mapping = {'10': '10', 
           '3305': '3305', 
           '32500': '32500', 
           '77A': '77A Avenue', 
           '8500': '8500', 
           '99': '99', 
           'Alley': 'Alley', 
           'Ave': 'Avenue', 
           'Ave.': 'Avenue', 
           'Blvd': 'Boulevard', 
           'Broadway': 'Broadway', 
           'Bypass': 'Bypass', 
           'Centre': 'Centre', 
           'Close': 'Close', 
           'Crescent': 'Crescent', 
           'Diversion': 'Diversion', 
           'Dr': 'Drive', 
           'Dr.': 'Drive', 
           'East': 'East', 
           'Edmonds': 'Edmonds Street', 
           'Gate': 'Gate', 
           'Grove': 'Grove', 
           'Hastings': 'Hastings Street', 
           'Highway': 'Highway', 
           'Hwy': 'Highway', 
           'Hwy.': 'Highway', 
           'Kingsway': 'Kingsway', 
           'Mall': 'Mall', 
           'Mews': 'Mews', 
           'Moncton': 'Moncton Street', 
           'North': 'North', 
           'Park': 'Park', 
           'Pender': 'Pender Street', 
           'RD': 'Road', 
           'Rd': 'Road', 
           'Rd.': 'Road', 
           'Road,': 'Road', 
           'S.': 'South', 
           'Sanders': 'Sanders Street', 
           'South': 'South', 
           'St': 'Street', 
           'St.': 'Street', 
           'Street3': 'Street', 
           'Terminal': 'Terminal', 
           'Tsawwassen': 'North Tsawwassen', 
           'Vancouver': 'Vancouver', 
           'Walk': 'Walk', 
           'Way': 'Way', 
           'West': 'West', 
           'Willingdon': 'Willingdon', 
           'Wynd': 'Wynd', 
           'av': 'Avenue', 
           'road': 'Road', 
           'st': 'Street', 
           'street': 'Street', 
          } 
#gotten from looking at the results of the audit 
#difference between mapping and changes is we are looking at the entire value in changes and not just the last word. see data.py 
#changes = { 'ing George Hwy.': 'King George Boulevard', 
#           'W15th st': 'W 15th Street', 
#           'Howe St. Vancouver': 'Howe Street', 
#           'W. Hastings St. Vancouver': 'West Hastings Street', 
#           'Expo Blvd, #3305': 'Expo Boulevard'             
#           ' Beatty St': 'Beatty Street'}          
##skip are the full values gotten from looking at the results of the audit that we don't want to include in the database. Will use in data.py 
#skip = ["10","32500","99","Tsawwassen","Park","Terminal","8500"] 

In [6]:
def audit_street_type(street_types, street_name): 
    m = street_type_re.search(street_name) 
    if m: 
        street_type = m.group() 
        if street_type not in expected: 
            street_types[street_type].add(street_name) 

In [7]:
def is_street_name(elem): 
    return (elem.attrib['k'] == "addr:street") 

In [8]:
def audit(osmfile): 
    osm_file = open(osmfile, "r") 
    street_types = defaultdict(set) 
    for event, elem in ET.iterparse(osm_file, events=("start",)): 
        if elem.tag == "node" or elem.tag == "way": 
            for tag in elem.iter("tag"): 
                if is_street_name(tag): 
                    audit_street_type(street_types, tag.attrib['v']) 
    elem.clear() #clear from memory 
    return street_types  

In [9]:
def update_name(name, mapping): 
    '''Update each street name with the replacement ending in the mapping dictionary''' 
    match = re.search(street_type_re,name) 
    name = re.sub(street_type_re,mapping[match.group()],name) 
    return name

In [10]:
st_types = audit(OSMFILE) 
pprint.pprint(dict(st_types))

{'1/2': set(['Avenue M 1/2', 'Avenue R 1/2']),
 '10': set(['I 10']),
 '110': set(['Memorial Drive, Ste 110']),
 '1142': set(['Lake Woodlands Drive #1142']),
 '125': set(['798 Sorella Court Suite 125']),
 '146': set(['TX 146', 'Texas 146']),
 '1464': set(['FM 1464']),
 '1488': set(['FM 1488', 'Farm-to-Market Road 1488']),
 '150': set(['Garth Road Suite 150']),
 '160': set(['Nelson Way #160']),
 '1663': set(['FM 1663']),
 '1764': set(['Farm-to-Market Road 1764']),
 '18': set(['800 W NASA Parkway #18']),
 '180': set(['Katy Freeway #180']),
 '185': set(['I-45 South, Suite 185']),
 '1960': set(['FM 1960', 'Fm 1960']),
 '200': set(['College Park Drive Ste 200']),
 '2100': set(['FM 2100']),
 '240': set(['Bissonnet St #240']),
 '242': set(['SH 242']),
 '249': set(['TX 249']),
 '270': set(['FM 270']),
 '290': set(['Highway 290', 'US 290', 'Windfern Rd #290']),
 '2920': set(['FM 2920', 'Fm 2920']),
 '300': set(['Town & Country Blvd #300']),
 '332': set(['TX 332']),
 '359': set(['FM 359', 'Fm 359

In [11]:
for st_type, ways in st_types.iteritems(): 
    for name in ways: 
        better_name = update_name(name, mapping) 
        print name, "=>", better_name 


KeyError: '1142'