In [2]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint as pp
import re

In [3]:
# Matches a string made up solely of lowercase ASCII letters and underscores
# (the empty string matches as well).
lower = re.compile(r'^([a-z]|_)*$')
# Matches two lowercase/underscore runs (either may be empty) joined by
# exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single occurrence of one of the listed punctuation or
# whitespace characters anywhere in the string.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Matches the run of non-whitespace characters at the very end of the string,
# starting at a word boundary (so for "... #108" the match is "108").
# re.IGNORECASE has no effect here: the pattern contains no literal letters.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

In [17]:
def count_tags(filename):
    """
    Count how many times each element tag occurs in an XML (OSM) document.

    The document is parsed iteratively with ``ET.iterparse`` so it never has
    to be loaded with a single ``parse`` call. Each element is cleared once it
    has been counted, which releases its attributes and children instead of
    keeping the fully populated tree alive until the end of the scan.

    Args:
        filename: Path of the XML file to scan (anything ``ET.iterparse``
            accepts, i.e. a file name or a binary file object).

    Returns:
        dict: tag name -> number of elements carrying that tag.
    """
    tags = {}
    for _, element in ET.iterparse(filename):
        tags[element.tag] = tags.get(element.tag, 0) + 1
        # Default iterparse events are "end" events: the element and all of
        # its children have already been seen (and counted), so its contents
        # are no longer needed.
        element.clear()

    return tags

In [18]:
# Tally element tags in the Vancouver OSM extract; the dict below is the recorded result.
count_tags("vancouver_canada.osm")

{'bounds': 1,
 'member': 13107,
 'nd': 1014262,
 'node': 819193,
 'osm': 1,
 'relation': 1821,
 'tag': 281433,
 'way': 157686}

In [None]:
def key_type(element, keys):
    """
    Classify the ``k`` attribute of a ``tag`` element and bump the matching
    counter in ``keys``.

    Elements whose tag is not "tag" are ignored. For a "tag" element, the
    element itself (and any nested "tag" descendants) is inspected and its
    key is placed in the first category that matches:

    "lower"        - key matches the ``lower`` pattern,
    "lower_colon"  - key matches the ``lower_colon`` pattern,
    "problemchars" - key contains a character matched by ``problemchars``,
    "other"        - none of the above.

    ``keys`` is updated in place and also returned. A "tag" element without
    a ``k`` attribute raises ``KeyError``.
    """
    if element.tag != "tag":
        return keys

    for tag_elem in element.iter("tag"):
        key = tag_elem.attrib["k"]
        if lower.search(key):
            category = "lower"
        elif lower_colon.search(key):
            category = "lower_colon"
        elif problemchars.search(key):
            category = "problemchars"
        else:
            category = "other"
        keys[category] += 1
    return keys

In [None]:
def process_keys(filename):
    """
    Tally the tag keys of an XML (OSM) file by category.

    Every element produced by ``ET.iterparse`` is handed to ``key_type``,
    which increments one of the four counters for each "tag" element.

    Args:
        filename: Path of the XML file to scan (anything ``ET.iterparse``
            accepts).

    Returns:
        dict with the keys "lower", "lower_colon", "problemchars" and
        "other", each mapped to the number of tag keys in that category.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        # The element has been fully handled at its "end" event; release its
        # attributes and children rather than keeping them for the whole scan.
        element.clear()

    return keys

In [None]:
# Categorise the tag keys of the Vancouver OSM extract (this cell has no recorded output).
process_keys("vancouver_canada.osm")

In [107]:
def get_users(element):
    """
    Collect the ``uid`` attribute value(s) carried by a single element.

    * "node": the element's own ``uid`` is returned whenever the attribute
      is present (even if it is an empty string).
    * "way" / "relation": the element itself and any descendants with the
      same tag are inspected; only non-empty ``uid`` values are returned.
    * any other tag: nothing is collected.

    Missing ``uid`` attributes are skipped through explicit lookups rather
    than by catching every exception, so unrelated errors are not hidden.

    Args:
        element: An ElementTree element.

    Returns:
        set: the uid strings found (possibly empty).
    """
    users = set()
    index = "uid"
    user_tags = ("way", "relation")

    tag = element.tag
    if tag == "node":
        uid = element.attrib.get(index)
        if uid is not None:
            users.add(uid)
    elif tag in user_tags:
        for member in element.iter(tag):
            uid = member.attrib.get(index)
            if uid:
                users.add(uid)
    return users

In [110]:
def process_users(filename):
    """
    Gather the distinct ``uid`` values found in an XML (OSM) file.

    Each element produced by ``ET.iterparse`` is passed to ``get_users`` and
    the uids it reports are merged into one set.

    Args:
        filename: Path of the XML file to scan (anything ``ET.iterparse``
            accepts).

    Returns:
        set: every uid string reported by ``get_users`` for the file.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        users.update(get_users(element))
        # Done with this element at its "end" event; release its attributes
        # and children rather than keeping them for the whole scan.
        element.clear()
    return users

In [111]:
# Number of distinct uid values found in the Vancouver OSM extract.
len(process_users("vancouver_canada.osm"))

894

In [4]:
# Street-type endings that audit_street_type treats as acceptable; a street
# name whose ending is in this list is not recorded by the audit.
expected = ["Street", "Alley", "Avenue", "Boulevard", "Crescent", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Connector", "Way", "Highway"]


# Replacement table used by update_name: an ending found as a key here
# (abbreviation, case variant or misspelling such as "Steet") is rewritten
# to the mapped full form. Lookups are exact and case-sensitive.
mapping = { "St": "Street", "St.": "Street", "street": "Street", "Steet": "Street",
            "Ave": "Avenue", 
            "Blvd": "Boulevard", "BLVD": "Boulevard",
            "Rd.": "Road"
          }

In [18]:
def audit_street_type(street_types, street_name):
    """
    Record ``street_name`` under its ending when that ending is unexpected.

    The ending is whatever ``street_type_re`` matches at the end of
    ``street_name``. If there is a match and it is not listed in
    ``expected``, the full name is added to ``street_types[ending]``
    (``street_types`` is expected to create missing entries itself, e.g. a
    ``defaultdict(set)``). Nothing is returned.
    """
    match = street_type_re.search(street_name)
    if not match:
        return

    ending = match.group()
    if ending in expected:
        return

    street_types[ending].add(street_name)

In [19]:
def is_street_name(elem):
    """
    Tell whether an element's ``k`` attribute is exactly "addr:street".

    Raises ``KeyError`` when the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [20]:
def update_name(name, mapping):
    """
    Return ``name`` with its ending replaced according to ``mapping``.

    The ending is whatever ``street_type_re`` matches at the end of
    ``name``. When that ending is a key of ``mapping``, the last
    ``len(ending)`` characters of ``name`` are dropped and the mapped value
    is appended; otherwise ``name`` is returned unchanged.
    """
    match = street_type_re.search(name)
    if not match:
        return name

    ending = match.group()
    if ending not in mapping:
        return name

    stem = name[:-len(ending)]
    return stem + mapping[ending]

In [23]:
def audit(osmfile):
    """
    Collect street names whose ending is still unexpected after cleaning.

    For every "node" and "way" element in the file, each child "tag" whose
    key is "addr:street" has its value passed through ``update_name`` (using
    the module-level ``mapping``) and then through ``audit_street_type``.

    Elements are examined on "end" events: on a "start" event ElementTree
    only guarantees the element's attributes, not its children, so iterating
    the child tags there can miss some of them.

    Args:
        osmfile: Path of the OSM XML file to audit.

    Returns:
        defaultdict(set): unexpected ending -> set of street names with it.
    """
    street_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser; ``with`` guarantees the
    # handle is closed even if parsing or auditing raises.
    with open(osmfile, "rb") as osm_file:
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, update_name(tag.attrib['v'], mapping))
                # This node/way is fully processed; release its children and
                # attributes instead of keeping them for the whole scan.
                elem.clear()
    return street_types

In [24]:
# Audit street names in the Vancouver OSM extract; the mapping below groups names by unexpected ending.
audit("vancouver_canada.osm")

defaultdict(set,
            {'108': {'8th Ave W #108'},
             '203': {'Fir Street #203', 'Sidney St #203'},
             '216': {'Granville St #216'},
             '2nd': {'East 2nd'},
             '328': {'Granville St #328'},
             '701': {'Comox St #701'},
             'Broadway': {'Broadway',
              'E. Broadway',
              'East Broadway',
              'W Broadway',
              'West Broadway',
              'west Broadway'},
             'Denmanstreet': {'Denmanstreet'},
             'E': {'37th Ave E'},
             'East': {'3rd Street East', 'Grand Boulevard East'},
             'Esplanade': {'415 West Esplanade', 'West Esplanade'},
             'G101': {'West Broadway #G101'},
             'Greenway': {'Carrall Street Greenway'},
             'Jarvis': {'Jarvis'},
             'Jervis': {'Jervis'},
             'Kingsway': {'Kingsway'},
             'Mall': {'East Mall',
              'Health Sciences Mall',
              'Lower Mall',
           