In [20]:
import xml.etree.cElementTree as ET
import pprint as pp
import re

In [21]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [18]:
def count_tags(filename):
    """
    This function uses iterative parsing to process the map file and find out 
    not only what tags are there, but also how many, to get the
    feeling on how much of which data you can expect to have in the map.
    It should return a dictionary with the tag name as the key and number of 
    times this tag can be encountered in the map as value.
    """
    tags = {}
    for event, element in ET.iterparse(filename):
        if element.tag not in tags:
            tags[element.tag] = 1
        else:
            tags[element.tag] += 1
        
    return tags

In [19]:
count_tags("vancouver_canada.osm")

{'bounds': 1,
 'member': 13107,
 'nd': 1014262,
 'node': 819193,
 'osm': 1,
 'relation': 1821,
 'tag': 281433,
 'way': 157686}

In [109]:
def key_type(element, keys):
    """
    This is a helper function for comparing tags to potential problems and increments the count of the problem
    in the dictionary that keeps track of 4 of these problems.
    four tag categories in a dictionary:
    "lower", for tags that contain only lowercase letters and are valid,
    "lower_colon", for otherwise valid tags with a colon in their names,
    "problemchars", for tags with problematic characters, and
    "other", for other tags that do not fall into the other three categories.
    """
    if element.tag == "tag":
        for tag in element.iter("tag"):
            street = tag.attrib["k"]
            if lower.search(street):
                keys["lower"] += 1
            elif lower_colon.search(street):
                keys["lower_colon"] += 1
            elif problemchars.search(street):
                keys["problemchars"] += 1
            else:
                keys["other"] += 1
    return keys

In [91]:
def process_keys(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [93]:
process_keys("vancouver_canada.osm")

{'lower': 248710, 'lower_colon': 29834, 'other': 2888, 'problemchars': 1}

In [162]:
def get_user(element):
    """
    This function finds out how many unique users have contributed to the map in this particular area!
    """
    user = set()
    index = "user"
    if element.tag == "node":
        for member in element.iter("node"):
            user.add(member.attrib[index])
    if element.tag == "way":
        for member in element.iter("way"):
            user.add(member.attrib[index])
    if element.tag == "relation":
        for member in element.iter("relation"):
            user.add(member.attrib[index])
    return user

In [163]:
def process_users(filename):
    users = set()

    for _, element in ET.iterparse(filename, events=("start",)):
        for user in get_user(element):
            users.add(user)

    return users

In [164]:
process_users("vancouver_canada.osm")

KeyError: 'user'