# In [13]:
import unicodecsv as csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus
#import schema

# OpenStreetMap XML extract that is passed to process_map() when the file is
# run as a script.
OSM_PATH = "./Data/portland_oregon.osm/portland_oregon.osm"
# Alternative input/output paths (presumably for a smaller sample extract);
# uncomment these and disable the corresponding paths to switch.
#OSM_PATH = "./Data/sample.osm"
#NODES_PATH = "./Data/Sample_Validated/sample_nodes.csv"
#NODE_TAGS_PATH = "./Data/Sample_Validated/sample_nodes_tags.csv"
#WAYS_PATH = "./Data/Sample_Validated/sample_ways.csv"
#WAY_NODES_PATH = "./Data/Sample_Validated/sample_ways_nodes.csv"
#WAY_TAGS_PATH = "./Data/Sample_Validated/sample_ways_tags.csv"



# CSV files written by process_map(), one per output table.
NODES_PATH = "./Data/Validated/nodes.csv"
NODE_TAGS_PATH = "./Data/Validated/nodes_tags.csv"
WAYS_PATH = "./Data/Validated/ways.csv"
WAY_NODES_PATH = "./Data/Validated/ways_nodes.csv"
WAY_TAGS_PATH = "./Data/Validated/ways_tags.csv"

# Matches tag keys that begin with lowercase letters/underscores, a colon and
# more lowercase letters/underscores (e.g. "addr:street"). The pattern is not
# anchored at the end, so only this prefix of the key has to match.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Matches one character regarded as problematic in a tag key: selected
# punctuation (= + / & < > ; ' " ? % # $ @ , .) or whitespace.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')



# Cerberus schema describing the dictionaries built by shape_element().
# The top-level keys mirror the keys of those dictionaries: 'node' and 'way'
# are single attribute dicts, the remaining entries are lists of row dicts.
# NOTE(review): 'coerce' is presumably applied by Cerberus before the type
# check, converting the string values read from the XML to int/float --
# confirm against the Cerberus version in use.
schema = {
    # Attributes of a single <node> element.
    'node': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'lat': {'required': True, 'type': 'float', 'coerce': float},
            'lon': {'required': True, 'type': 'float', 'coerce': float},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    # One row per <tag> child of a node.
    'node_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    },
    # Attributes of a single <way> element (same as a node, minus lat/lon).
    'way': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    # One row per <nd> child of a way; 'position' is its index within the way.
    'way_nodes': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'node_id': {'required': True, 'type': 'integer', 'coerce': int},
                'position': {'required': True, 'type': 'integer', 'coerce': int}
            }
        }
    },
    # One row per <tag> child of a way (same layout as 'node_tags').
    'way_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    }
}


# Upper-case alias; used as the default schema of validate_element().
SCHEMA = schema

# Column names (and column order) of each CSV file; process_map() passes them
# to the DictWriter instances. NODE_FIELDS and WAY_FIELDS additionally select
# which XML attributes shape_element() copies from <node> and <way> elements.
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Abbreviated street-type suffix -> full street type, handed to
# update_street() by shape_element(). Lookups are exact and case-sensitive,
# so every spelling variant that should be corrected needs its own entry.
# (The literal previously listed "Rd." twice; each key now appears once.)
mapping_street = {
    "St.": "Street",
    "St": "Street",
    "Ave.": "Avenue",
    "AVE": "Avenue",
    "Ave": "Avenue",
    "Blvd.": "Boulevard",
    "Cir": "Circle",
    "Dr": "Drive",
    "Dr.": "Drive",
    "Ct.": "Court",
    "Pl.": "Place",
    "Sq.": "Square",
    "Hwy": "Highway",
    "Ln.": "Lane",
    "Rd.": "Road",
    "Tr.": "Trail",
    "Pkwy": "Parkway",
    "Pky": "Parkway",
    "Cmmn": "Commons",
    "Rd": "Road",
}


# Raw 'addr:city' value -> corrected city name, handed to update_city() by
# shape_element(). The keys cover misspellings, a ZIP code entered in place
# of a city, and city names carrying a state suffix.
mapping_city = {"Portlan": "Portland",
            "97086": "Happy Valley",
            "Vancoucer": "Vancouver",
            "Beaverton, OR":"Beaverton",
            "Portland, OR":"Portland",
            "Portland, Oregon":"Portland",
            }


# Street types that are already in their full form; update_street() leaves a
# street name alone when its last token is one of these.
expected_street = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
                   "Trail", "Parkway", "Circle", "Commons"]

# City names that are accepted as-is by update_city(). Membership tests are
# exact string comparisons, so entries must be spelled correctly and carry no
# surrounding whitespace (several entries were previously misspelled or had a
# trailing space and therefore could never match a correct value).
expected_city = ["Banks", "Beaverton", "Cornelius", "Durham", "Forest Grove", "Gaston", "Hillsboro", "King City",
                 "Lake Oswego", "North Plains", "Portland", "Rivergrove", "Sherwood", "Tigard", "Tualatin",
                 "Wilsonville", "Barlow", "Canby", "Happy Valley", "Molalla", "Sandy", "Johnson City",
                 "Oregon City", "Estacada", "West Linn", "Gladstone", "Milwaukie", "Clatskanie", "Columbia City",
                 "Prescott", "Rainier", "St. Helens", "Scappoose", "Vernonia", "Fairview", "Gresham",
                 "Maywood Park", "Troutdale", "Wood Village", "Amity", "Carlton", "Dayton", "Dundee",
                 "Lafayette", "Newberg", "McMinnville", "Sheridan", "Willamina", "Yamhill", "Battle Ground",
                 "Camas", "La Center", "Ridgefield", "Vancouver", "Washougal", "Woodland", "North Bonneville",
                 "Stevenson"]

# Matches the trailing run of non-whitespace characters that starts at a word
# boundary and reaches the end of the string (e.g. "St." in "Main St.").
# Used with .search() to pick out the last token of an address value.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


def update_street(street, mapping_street):
    """Return *street* with an abbreviated trailing street type expanded.

    The last token of the name (as found by ``street_type_re``) is looked up
    in *mapping_street*. When that token is not already listed in
    ``expected_street`` and a replacement exists, only that trailing token is
    replaced.

    Args:
        street: Raw street name, e.g. ``"SE Stark St."``.
        mapping_street: Dict mapping abbreviations to full street types.

    Returns:
        The corrected street name, or *street* unchanged when it has no
        trailing token, the token is already an expected type, or no
        replacement is known.
    """
    m = street_type_re.search(street)
    if m is None:
        # Empty or punctuation/whitespace-only value: nothing to correct.
        return street

    street_type = m.group()
    if street_type in expected_street or street_type not in mapping_street:
        return street

    # Splice by position rather than calling re.sub() with the matched text:
    # the text would be interpreted as a regex ("St." also matches "Sta") and
    # every occurrence would be replaced ("Stark St" -> "Streetark Street").
    return street[:m.start()] + mapping_street[street_type] + street[m.end():]

def update_city(city, mapping_city):
    """Return the corrected form of a city value.

    The whole value (with surrounding whitespace ignored) is compared against
    ``expected_city`` and then looked up in *mapping_city*. Matching the whole
    value, rather than only its last token, is what allows multi-word keys
    such as ``"Portland, OR"`` to be found.

    Args:
        city: Raw city value, e.g. ``"Portland, OR"``.
        mapping_city: Dict mapping known bad values to the correct city name.

    Returns:
        The replacement from *mapping_city* when one exists for a value that
        is not already an expected city; otherwise *city* unchanged.
    """
    name = city.strip()
    if name in expected_city:
        return city
    # The previous version stored the replacement in an unused variable and
    # always returned the input; return the mapped value instead.
    return mapping_city.get(name, city)
    
def _shape_tag(child, element_id, problem_chars, default_tag_type):
    """Turn one <tag> child into a row dict, or None if its key is rejected."""
    key = child.attrib['k']
    # search() rather than match(): a problematic character anywhere in the
    # key disqualifies the tag, not only one in the first position.
    if problem_chars.search(key):
        return None

    value = child.attrib['v']
    if LOWER_COLON.match(key):
        # "addr:street" -> type "addr", key "street"; any further colons stay
        # in the key part.
        tag_type, tag_key = key.split(':', 1)
        if key == 'addr:street':
            value = update_street(value, mapping_street)
        elif key == 'addr:city':
            value = update_city(value, mapping_city)
    else:
        tag_type, tag_key = default_tag_type, key

    return {'id': element_id, 'key': tag_key, 'value': value, 'type': tag_type}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Convert a <node> or <way> XML element into plain dictionaries.

    Args:
        element: ElementTree element to convert.
        node_attr_fields: Attribute names copied from a <node> element.
        way_attr_fields: Attribute names copied from a <way> element.
        problem_chars: Compiled regex; a tag whose key contains a match is
            dropped.
        default_tag_type: 'type' value for tags whose key has no
            ``prefix:name`` form.

    Returns:
        For a node: ``{'node': attribs, 'node_tags': [tag, ...]}``.
        For a way: ``{'way': attribs, 'way_nodes': [nd, ...],
        'way_tags': [tag, ...]}``.
        ``None`` for any other element.

        Each tag is a dict with 'id', 'key', 'value' and 'type'; each way
        node is a dict with 'id', 'node_id' and 'position' (0-based index of
        the <nd> child within the way). 'addr:street' and 'addr:city' values
        are passed through update_street() / update_city().
    """
    is_node = element.tag == 'node'
    if not is_node and element.tag != 'way':
        return None

    # Attributes absent from the XML get the "9999999" placeholder. The
    # previous loop only visited attributes that were present, so its
    # fallback could never fire.
    fields = node_attr_fields if is_node else way_attr_fields
    attribs = {}
    for field in fields:
        attribs[field] = element.attrib.get(field, "9999999")

    tags = []  # Secondary tags are handled identically for nodes and ways
    way_nodes = []
    for child in element:
        if child.tag == 'tag':
            tag = _shape_tag(child, element.attrib['id'], problem_chars,
                             default_tag_type)
            if tag is not None:
                tags.append(tag)
        elif child.tag == 'nd' and not is_node:
            way_nodes.append({
                'id': element.attrib['id'],
                'node_id': child.attrib['ref'],
                # Index of this <nd> among the way's node references.
                'position': len(way_nodes),
            })

    if is_node:
        return {'node': attribs, 'node_tags': tags}
    return {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(filename, tags=('node', 'way', 'relation')):
    """Lazily yield every fully parsed element whose tag is listed in *tags*.

    The document is parsed incrementally; once the consumer has finished with
    a yielded element, the root element is cleared before parsing continues.
    """
    events = ET.iterparse(filename, events=('start', 'end'))
    # The first event is the 'start' of the document root.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs when the generator is resumed, i.e. after the caller is done
        # with the element that was just yielded.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception if *element* does not match *schema*.

    Args:
        element: Dictionary to check (as produced by shape_element()).
        validator: Object providing ``validate(document, schema)`` and an
            ``errors`` mapping, e.g. a ``cerberus.Validator``.
        schema: Schema handed to ``validator.validate``.

    Raises:
        Exception: When validation fails; the message names the first
            offending field and lists its errors.
    """
    if validator.validate(element, schema) is not True:
        # items() returns a list or a view, neither of which is an iterator,
        # so wrap it in iter() before asking for the first entry.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))




# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream the OSM file and write its nodes and ways to the CSV outputs.

    Args:
        file_in: Path of the OSM XML file to read.
        validate: When exactly ``True``, every shaped element is checked with
            validate_element() before it is written.
    """
    with codecs.open(NODES_PATH, 'wb') as nodes_out, \
            codecs.open(NODE_TAGS_PATH, 'wb') as node_tags_out, \
            codecs.open(WAYS_PATH, 'wb') as ways_out, \
            codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_out, \
            codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_out:

        nodes_csv = csv.DictWriter(nodes_out, NODE_FIELDS)
        node_tags_csv = csv.DictWriter(node_tags_out, NODE_TAGS_FIELDS)
        ways_csv = csv.DictWriter(ways_out, WAY_FIELDS)
        way_nodes_csv = csv.DictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tags_csv = csv.DictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Every output file starts with its header row.
        for writer in (nodes_csv, node_tags_csv, ways_csv,
                       way_nodes_csv, way_tags_csv):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                nodes_csv.writerow(shaped['node'])
                node_tags_csv.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_csv.writerow(shaped['way'])
                way_nodes_csv.writerows(shaped['way_nodes'])
                way_tags_csv.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: validation is roughly 10x slower. For the project, consider
    # validating against a small sample of the map rather than the full file.
    process_map(OSM_PATH, validate=True)