In [130]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint as pp
import re
import csv
import codecs
import cerberus
import schema

In [131]:
osm_path = "vancouver_canada.osm"

nodes_path = "nodes.csv"
node_tags_path = "node_tags.csv"
ways_path = "ways.csv"
way_nodes_path = "way_nodes.csv"
way_tags_path = "way_tags.csv"

In [132]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

In [133]:
def count_tags(filename):
    """
    This function uses iterative parsing to process the map file and find out 
    not only what tags are there, but also how many, to get the
    feeling on how much of which data you can expect to have in the map.
    It should return a dictionary with the tag name as the key and number of 
    times this tag can be encountered in the map as value.
    """
    tags = {}
    for event, element in ET.iterparse(filename):
        if element.tag not in tags:
            tags[element.tag] = 1
        else:
            tags[element.tag] += 1
        
    return tags

In [134]:
def key_type(element, keys):
    """
    This is a helper function for comparing tags to potential problems and increments 
    the count of the problem in the dictionary that keeps track of 4 of these problems.
    four tag categories in a dictionary:
    "lower", for tags that contain only lowercase letters and are valid,
    "lower_colon", for otherwise valid tags with a colon in their names,
    "problemchars", for tags with problematic characters, and
    "other", for other tags that do not fall into the other three categories.
    """
    if element.tag == "tag":
        for tag in element.iter("tag"):
            street = tag.attrib["k"]
            if lower.search(street):
                keys["lower"] += 1
            elif lower_colon.search(street):
                keys["lower_colon"] += 1
            elif problemchars.search(street):
                keys["problemchars"] += 1
            else:
                keys["other"] += 1
    return keys

In [135]:
def process_keys(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [136]:
def get_users(element):
    users = set()
    index = "uid"
    user_tags = ["way", "relation"]

    tag = element.tag
    if tag == "node":
        try:
            users.add(element.attrib[index])
        except:
            users = users
    elif tag in user_tags:
        for member in element.iter(tag):
            try:
                if member.attrib["uid"]:
                    users.add(member.attrib[index])
            except:
                users = users
    return users

In [137]:
def process_users(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        for user in get_users(element):
            users.add(user)
    return users

In [138]:
expected = ["Street", "Alley", "Avenue", "Boulevard", "Crescent", "Drive", 
            "Court", "Place", "Square", "Lane", "Road", "Mall", "Broadway",
            "Trail", "Parkway", "Commons", "Connector", "Walk", "Way", "Highway", 
            "Mews", "Kingsway", "Greenway", "Seawall", "South", "Terminal"]


mapping = {"St": "Street", "St.": "Street", "street": "Street", "Steet": "Street", 
           "Ave": "Avenue", "Venue": "Avenue", 
           "Blvd": "Boulevard", "BLVD": "Boulevard", 
           "Esplanade": "Esplanade Avenue", 
           "Rd.": "Road", 
           "Jervis": "Jervis Street", "Jarvis": "Jervis Street",
           "Pender": "Pender Street",
           "Nanaimo": "Nanaimo Street",
           "2nd": "2nd Avenue",
           "Denmanstreet": "Denman Street"}

In [139]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [140]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [141]:
def update_name(name, mapping):
    unique = ["108", "203", "216", "328", "701", "G101"]
    direction = ["East", "West"]

    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = name[:-len(street_type)] + mapping[street_type]
        elif street_type == 'Vancouver':
            name = name[:-len(street_type)].strip()
            return update_name(name, mapping)
        elif street_type in unique:
            parts = name.split("#")
            name = parts[1] + " " +parts[0].strip()
            return update_name(name, mapping)
        elif street_type in direction:
            parts = name.split(" ")
            name = parts[2] + " " +parts[0] + " " +parts[1]
            return update_name(name, mapping)
        elif street_type == "E":
            parts = name.split(" ")
            name = "East "+ parts[0]+ " Avenue"
        elif street_type == "W":
            parts = name.split(" ")
            if len(parts) > 2:
                name = parts[0]+ " West "+ parts[1]+ " Avenue"
            elif len(parts) < 3:
                name = "West "+ parts[0]

    return name

In [142]:
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, update_name(tag.attrib['v'], mapping))
    osm_file.close()
    return street_types

In [143]:
SCHEMA = schema.Schema

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [144]:
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=problemchars, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    # YOUR CODE HERE
    if element.tag == 'node':
        try:
            for field in NODE_FIELDS:
                node_attribs[field] = (element.attrib[field])
            for tag in element.iter("tag"):
                tag_id = node_attribs["id"]
                tag_value = tag.attrib["v"]
                if is_street_name(tag):
                    tag_value = update_name(tag_value, mapping)
                tag_key = tag.attrib["k"]
                if problemchars.search(tag_key):
                    break
                if tag_key.find(":") == -1:
                    tag_type = default_tag_type
                else:
                    tag_key = tag_key[tag_key.find(":")+1:]
                    tag_type = tag_key[:tag_key.find(":")]
                tags.append({'id':tag_id, 'key':tag_key, 'value':tag_value, 'type':tag_type})
        except:
            pass
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        try:
            position = 0
            for field in WAY_FIELDS:
                way_attribs[field] = (element.attrib[field])
            for member in element.iter("nd"):
                way_nodes.append({"id": way_attribs["id"], "node_id": member.attrib["ref"], "position": position})
                position += 1
            for tag in element.iter("tag"):
                tag_id = way_attribs["id"]
                tag_value = tag.attrib["v"]
                if is_street_name(tag):
                    tag_value = update_name(tag_value, mapping)
                tag_key = tag.attrib["k"]
                if problemchars.search(tag_key):
                    break
                if tag_key.find(":") == -1:
                    tag_type = default_tag_type
                else:
                    tag_key = tag_key[tag_key.find(":")+1:]
                    tag_type = tag_key[:tag_key.find(":")]
                tags.append({'id':tag_id, 'key':tag_key, 'value':tag_value, 'type':tag_type})
        except:
            pass
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

In [145]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

In [146]:
def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))

In [147]:
class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

In [149]:
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(nodes_path, 'w') as nodes_file, codecs.open(node_tags_path, 'w') as nodes_tags_file, \
         codecs.open(ways_path, 'w') as ways_file, codecs.open(way_nodes_path, 'w') as way_nodes_file, \
         codecs.open(way_tags_path, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])

In [150]:
process_map(osm_path, validate=True)

AttributeError: 'dict' object has no attribute 'iteritems'