In [41]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint as pp
import re

In [42]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

In [43]:
def count_tags(filename):
    """
    This function uses iterative parsing to process the map file and find out 
    not only what tags are there, but also how many, to get the
    feeling on how much of which data you can expect to have in the map.
    It should return a dictionary with the tag name as the key and number of 
    times this tag can be encountered in the map as value.
    """
    tags = {}
    for event, element in ET.iterparse(filename):
        if element.tag not in tags:
            tags[element.tag] = 1
        else:
            tags[element.tag] += 1
        
    return tags

In [44]:
#count_tags("vancouver_canada.osm")

In [45]:
def key_type(element, keys):
    """
    This is a helper function for comparing tags to potential problems and increments 
    the count of the problem in the dictionary that keeps track of 4 of these problems.
    four tag categories in a dictionary:
    "lower", for tags that contain only lowercase letters and are valid,
    "lower_colon", for otherwise valid tags with a colon in their names,
    "problemchars", for tags with problematic characters, and
    "other", for other tags that do not fall into the other three categories.
    """
    if element.tag == "tag":
        for tag in element.iter("tag"):
            street = tag.attrib["k"]
            if lower.search(street):
                keys["lower"] += 1
            elif lower_colon.search(street):
                keys["lower_colon"] += 1
            elif problemchars.search(street):
                keys["problemchars"] += 1
            else:
                keys["other"] += 1
    return keys

In [46]:
def process_keys(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [47]:
#process_keys("vancouver_canada.osm")

In [48]:
def get_users(element):
    users = set()
    index = "uid"
    user_tags = ["way", "relation"]

    tag = element.tag
    if tag == "node":
        try:
            users.add(element.attrib[index])
        except:
            users = users
    elif tag in user_tags:
        for member in element.iter(tag):
            try:
                if member.attrib["uid"]:
                    users.add(member.attrib[index])
            except:
                users = users
    return users

In [49]:
def process_users(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        for user in get_users(element):
            users.add(user)
    return users

In [50]:
#process_users("vancouver_canada.osm")

In [113]:
expected = ["Street", "Alley", "Avenue", "Boulevard", "Crescent", "Drive", 
            "Court", "Place", "Square", "Lane", "Road", "Mall", "Broadway",
            "Trail", "Parkway", "Commons", "Connector", "Walk", "Way", "Highway", 
            "Mews", "Kingsway", "Greenway", "Seawall", "South", "Terminal"]


mapping = {"St": "Street", "St.": "Street", "street": "Street", "Steet": "Street", 
           "Ave": "Avenue", "Venue": "Avenue", 
           "Blvd": "Boulevard", "BLVD": "Boulevard", 
           "Esplanade": "Esplanade Avenue", 
           "Rd.": "Road", 
           "Jervis": "Jervis Street", "Jarvis": "Jervis Street",
           "Pender": "Pender Street",
           "Nanaimo": "Nanaimo Street",
           "2nd": "2nd Avenue",
           "Denmanstreet": "Denman Street"}

In [108]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [109]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [157]:
def update_name(name, mapping):
    unique = ["108", "203", "216", "328", "701", "G101"]
    direction = ["East", "West"]

    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = name[:-len(street_type)] + mapping[street_type]
        elif street_type == 'Vancouver':
            name = name[:-len(street_type)].strip()
            update_name(name, mapping)
        elif street_type in unique:
            parts = name.split("#")
            name = parts[1] + " " +parts[0].strip()
            update_name(name, mapping)
        elif street_type in direction:
            parts = name.split(" ")
            name = parts[2] + " " +parts[0] + " " +parts[1]
            update_name(name, mapping)
        elif street_type == "E":
            parts = name.split(" ")
            name = "East "+ parts[0]+ " Avenue"
        elif street_type == "W":
            parts = name.split(" ")
            if len(parts) > 2:
                name = parts[0]+ " West "+ parts[1]+ " Avenue"
            elif len(parts) < 3:
                name = "West "+ parts[0]

    return name

In [158]:
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, update_name(tag.attrib['v'], mapping))
    osm_file.close()
    return street_types

In [159]:
audit("vancouver_canada.osm")

defaultdict(set,
            {'St': {'203 Sidney St',
              '216 Granville St',
              '328 Granville St',
              '701 Comox St'},
             'St.': {'W. Hastings St.'},
             'W': {'108 8th Ave W'}})