In [2]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET


# Input OSM extract; OSM_PATH is the name the processing code reads from.
OSMFILE = "san-francisco_california.osm"
OSM_PATH = OSMFILE

# Output CSV files, one per table.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Key that begins with "<lowercase/underscore run>:<lowercase/underscore run>".
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# A single character considered problematic inside a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Make sure the fields order in the csvs matches the column order in the
# sql table schema.
NODE_FIELDS = [
    'id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp',
]
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Street-name words (abbreviations, odd casing, misspellings) mapped to the
# form they should be replaced with.
mapping = {
    # Street types
    "St": "Street",
    "St.": "Street",
    "st": "Street",
    "STREET": "Street",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Dr.": "Drive",
    "Dr": "Drive",
    "Rd": "Road",
    "Rd.": "Road",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Ehs": "EHS",
    "Trl": "Trail",
    "Cir": "Circle",
    "Cir.": "Circle",
    "Ct": "Court",
    "Ct.": "Court",
    "Crt": "Court",
    "Crt.": "Court",
    "By-pass": "Bypass",
    "Plz": "Plaza",
    # Compass directions (plus one spelling correction)
    "N.": "North",
    "N": "North",
    "E.": "East",
    "E": "East",
    "S.": "South",
    "Socity": "Society",
    "S": "South",
    "W.": "West",
    "W": "West",
}


def is_street_name(elem):
    """Return True when the element's 'k' attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"

def update_name(name, mapping):
    """Return `name` with every whitespace-separated word found in `mapping` replaced.

    Words are replaced in place, so the word order of the street name is
    preserved (e.g. "N Main St" -> "North Main Street").

    Args:
        name: the street name to clean.
        mapping: dict from a word to its replacement.

    Returns:
        The cleaned name joined with single spaces, or `name` unchanged when
        none of its words appear in `mapping`.
    """
    words = name.split()
    # Leave the input untouched (including its original spacing) when there is
    # nothing to replace.
    if not any(word in mapping for word in words):
        return name
    return ' '.join(mapping.get(word, word) for word in words)


def is_zipcode(elem):
    """Return True when the element's 'k' attribute is exactly "addr:postcode"."""
    key = elem.attrib['k']
    return key == "addr:postcode"


def update_zip(zipcode):
    """Return the first run of digits in `zipcode`, or None if it has no digits.

    This strips prefixes such as "CA " and suffixes such as "-1234"
    (e.g. "CA 94107" -> "94107", "94107-1234" -> "94107").

    Args:
        zipcode: the raw postal-code string.

    Returns:
        The first contiguous digit sequence as a string, or None when the
        input contains no digit.
    """
    digits = re.search(r'\d+', zipcode)
    if digits is None:
        return None
    return digits.group(0)


def _shape_tag(tag_elem, parent_id, problem_chars, default_tag_type):
    """Build the row dict for one <tag> child, or None if its key is rejected."""
    key = tag_elem.attrib['k']
    # search(), not match(): a problem character anywhere in the key (not only
    # in first position) disqualifies the tag.
    if problem_chars.search(key):
        return None
    if LOWER_COLON.match(key):
        # "addr:street" -> type "addr", key "street"; only the first colon splits.
        tag_type, tag_key = key.split(':', 1)
    else:
        tag_type, tag_key = default_tag_type, key
    return {
        'id': parent_id,
        'key': tag_key,
        'value': tag_elem.attrib['v'],
        'type': tag_type,
    }


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict.

    Args:
        element: an XML element; only tags 'node' and 'way' are handled.
        node_attr_fields: attribute names copied from a node element.
        way_attr_fields: attribute names copied from a way element.
        problem_chars: compiled regex; a tag whose key contains a match is
            skipped.
        default_tag_type: 'type' given to tags whose key does not match
            LOWER_COLON.

    Returns:
        For a node: {'node': attribs, 'node_tags': [tag rows]}.
        For a way: {'way': attribs, 'way_nodes': [node refs], 'way_tags': [tag rows]}.
        None for any other element tag.

    Side effects:
        Prints each cleaned postcode / street name found on a way.
    """
    if element.tag == 'node':
        node_id = element.attrib['id']
        node_attribs = {
            field: element.attrib[field]
            for field in node_attr_fields
            if field in element.attrib
        }
        node_tags = []
        # NOTE(review): node tags are not passed through update_zip/update_name,
        # unlike way tags below -- confirm this is intended.
        for child in element:
            if child.tag != 'tag':
                continue
            node_tag = _shape_tag(child, node_id, problem_chars, default_tag_type)
            if node_tag is not None:
                node_tags.append(node_tag)
        return {'node': node_attribs, 'node_tags': node_tags}

    if element.tag == 'way':
        way_id = element.attrib['id']
        way_attribs = {
            field: element.attrib[field]
            for field in way_attr_fields
            if field in element.attrib
        }
        way_nodes = []
        way_tags = []
        for child in element:
            if child.tag == 'nd':
                way_nodes.append({
                    'id': way_id,
                    'node_id': child.attrib['ref'],
                    # 0-based index of this reference within the way.
                    'position': len(way_nodes),
                })
            elif child.tag == 'tag':
                way_tag = _shape_tag(child, way_id, problem_chars, default_tag_type)
                if way_tag is None:
                    continue
                if is_zipcode(child):
                    # Clean once and reuse the result for both storage and output.
                    way_tag['value'] = update_zip(child.attrib['v'])
                    print(way_tag['value'])
                elif is_street_name(child):
                    way_tag['value'] = update_name(child.attrib['v'], mapping)
                    print(way_tag['value'])
                elif (not LOWER_COLON.match(child.attrib['k'])
                      and problem_chars.match(child.attrib['v'])):
                    # NOTE(review): kept from the original -- non-colon way tags
                    # whose VALUE starts with a problem character are skipped
                    # (this drops e.g. values beginning with '+' or '#').
                    # Confirm whether this filter is wanted.
                    continue
                # Colon-keyed tags other than postcode/street are kept as-is
                # rather than being discarded.
                way_tags.append(way_tag)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': way_tags}

    return None



# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    The file is parsed incrementally; after the consumer is done with a
    yielded element, the root element is cleared before parsing continues.
    """
    parser = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the 'start' of the document root.
    root = next(parser)[1]
    for event, elem in parser:
        if event != 'end' or elem.tag not in tags:
            continue
        yield elem
        root.clear()




class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    On Python 2, `unicode` values are encoded to UTF-8 byte strings before
    being handed to the csv module. On Python 3 the builtin `unicode` does not
    exist and rows are passed through unchanged.
    """

    # Resolved once at class creation: the Python 2 text type, or None when the
    # builtin is absent (Python 3).
    try:
        _TEXT_TYPE = unicode  # noqa: F821
    except NameError:
        _TEXT_TYPE = None

    def writerow(self, row):
        """Write one row dict, UTF-8 encoding text values where required."""
        text_type = self._TEXT_TYPE
        if text_type is not None:
            row = {
                key: (value.encode('utf-8') if isinstance(value, text_type) else value)
                for key, value in row.items()
            }
        super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write each row through writerow so the same encoding is applied."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: OSM input handed to get_element.
        validate: accepted for interface compatibility; this function does not
            currently perform any validation with it.
    """
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        # One writer per output table, each with its own column order.
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        all_writers = (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer)
        for writer in all_writers:
            writer.writeheader()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            kind = element.tag
            if kind == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif kind == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


if __name__ == "__main__":
    # Script entry point: convert the OSM extract at OSM_PATH into the CSVs.
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)

Willie Mays Plaza
94107
La Loma Avenue
La Loma Avenue
Bellevue Avenue
Olympus Avenue
Haste Street
La Loma Avenue
Filbert Street
Coleman Place
94030
94030
94030
East 5th Avenue
94941
Saint Jude Road
94965
Harding Road
94132
La Loma Avenue
La Loma Avenue
La Loma Avenue
La Loma Avenue
Mountain Boulevard
94611
9th Street
94607
Washington Street
Broadway
94607
94610
Bay Place
Mandela Parkway
94608
94608
Mandela Parkway
Horton Street
94608
94608
40th Street
Shellmound Street
94608
Oregon Street
Adeline Street
94621
Coliseum Way
Doolittle Drive
94603
Airport Boulevard
Anza Boulevard
Airport Boulevard
East 3rd Avenue
94404
94128
Airport South Boulevard
Oracle Parkway
94065
Oracle Parkway
94065
Oracle Parkway
94065
Oracle Parkway
94065
Oracle Parkway
94065
Oracle Parkway
94065
Willow Road
94025
Willow Road
94025
Willow Road
Willow Road
94025
Willow Road
94114
Pleasant Hill Road
94523
La Loma Avenue
Alameda De Las Pulgas
94002
Dartmouth Avenue
94070
El Cerrito Plaza
95430
El Cerrito Plaza
95430
