In [37]:
## SECTION TO CREATE A SAMPLE FILE FROM THE FULL MAP FROM UDACITY

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Base directory holding the full OSM extract and the generated sample.
# Every backslash is escaped explicitly: "\P" and "\U" are not valid escape
# sequences (and "\U" is a SyntaxError on Python 3). The resulting value is
# the same Windows path as before.
DIRECTORY = "C:\\Projects\\Udacity\\"
OSM_FILE = DIRECTORY + "berlin_germany.osm"  # Replace this with your osm file
SAMPLE_FILE = DIRECTORY + "berlin_germany_sample.osm"

# Set to 1000 for the submission file
k = 1000  # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every element of osm_file whose tag is listed in tags.

    The first parser event is consumed to obtain the root element; after
    each yielded element the root is cleared.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    root = next(parse_events)[1]
    for kind, node in parse_events:
        # Only fully parsed ('end') elements of a wanted tag are handed out.
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# The sample file is opened in binary mode and ET.tostring(..., encoding='utf-8')
# is written to it directly, so the fixed fragments are bytes literals as well.
# Plain str literals raise TypeError on Python 3 when written to a 'wb' file;
# on Python 2 b'...' is the same type as '...', so the output is unchanged.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [2]:

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

# Base directory for the OSM extract and the generated CSV files.
# Every backslash is escaped explicitly: "\P" and "\U" are not valid escape
# sequences (and "\U" is a SyntaxError on Python 3). The resulting value is
# the same Windows path as before.
DIRECTORY = "C:\\Projects\\Udacity\\"

OSM_PATH = DIRECTORY + "berlin_germany.osm"
#OSM_PATH = DIRECTORY + "berlin_germany_sample.osm"

NODES_PATH = DIRECTORY + "nodes.csv"
NODE_TAGS_PATH = DIRECTORY + "nodes_tags.csv"
WAYS_PATH = DIRECTORY + "ways.csv"
WAY_NODES_PATH = DIRECTORY + "ways_nodes.csv"
WAY_TAGS_PATH = DIRECTORY + "ways_tags.csv"


### PROBLEMCHARS: characters that make a tag key invalid (':' and '.' are allowed).
### LOWER_COLON: at least one character on each side of a colon; the problem
### characters have already been checked for when this pattern is applied.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\, \t\r\n]')
LOWER_COLON = re.compile(r'^(.)+:(.)+')

#SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Tag keys that require special cleanup
KNOWNLISTTYPES = ['postal_codes']     # raw keys whose value is a comma-separated list
KNOWN_NODEDICTTYPES = ['addr']        # corrected node keys routed through dict_data()
KNOWN_WAYDICTTYPES = ['addr']         # corrected way keys routed through dict_data()

def return_corrected_and_type(attribute):
    """Split a raw tag key into its cleaned key and its type.

    Surrounding whitespace is stripped first. A key containing any character
    matched by PROBLEMCHARS is rejected. A key matched by LOWER_COLON
    ("prefix:rest") is split on its first colon: the prefix becomes the type
    and everything after it (further colons included) becomes the key. Any
    other key is returned unchanged with the type 'regular'.

    Args:
        attribute: the raw 'k' attribute of a tag.

    Returns:
        A (key, type) tuple, or None when the key contains problem characters.
    """
    attribute = attribute.strip()

    if PROBLEMCHARS.search(attribute) is not None:
        # Explicit rejection instead of silently falling off the end.
        return None

    if LOWER_COLON.search(attribute):
        tag_type, _, key = attribute.partition(':')
        return key, tag_type

    return attribute, 'regular'

def cleanups(attribute, value):
    """Clean one tag key/value pair, or reject it.

    A None result means the element carrying this tag holds irrelevant data
    (for example an invalid postal code) and should be dismissed.

    Key cleanups:
      * 'postal_code', 'addr:postcode' and 'postal_codes' become 'postcode'.
      * keys 'simc', 'addr:simc' and 'city:simc' are rejected (Polish
        locations that use SIMC address codes).

    Value cleanups:
      * country 'Deutschland' / 'Germany' becomes 'DE'.
      * a list value is returned as-is, without value checks.
      * a postcode must be 5 characters long and lie in 10115..14199
        (inclusive); anything else, including non-numeric text, is rejected.
      * in street names 'str.' becomes 'straße'; otherwise ' Str.' becomes
        ' Straße'.

    Args:
        attribute: the (possibly already corrected) tag key.
        value: the tag value, or a list of values.

    Returns:
        A (attribute, value) tuple with the cleaned data, or None to reject.
    """
    ## KEY CLEANUPS

    # Unify the different postal code keys
    if attribute in ('postal_code', 'addr:postcode', 'postal_codes'):
        attribute = 'postcode'

    # Normalise the country to DE
    if attribute == 'country' and value in ('Deutschland', 'Germany'):
        value = 'DE'

    # Purge Polish locations that use SIMC address codes
    if attribute in ('simc', 'addr:simc', 'city:simc'):
        return None

    # List values are handed back with the corrected key, without value checks
    if isinstance(value, list):
        return attribute, value

    ## VALUE TESTING

    if attribute == 'postcode':
        if len(value) != 5:
            return None
        try:
            postcode = int(value)
        except ValueError:
            # 5 characters but not a number: reject instead of aborting the
            # whole conversion with an uncaught ValueError.
            return None
        # Range test, filters out Polish data
        if postcode < 10115 or postcode > 14199:
            return None

    # Expand the abbreviated "str." / " Str." in street names. Unicode
    # literals with an escaped sharp s are used so the result does not
    # depend on the source encoding or on a str.decode() method.
    if attribute in ('street', 'addr:street'):
        if 'str.' in value:
            value = value.replace('str.', u'stra\xdfe')
        elif ' Str.' in value:
            value = value.replace(' Str.', u' Stra\xdfe')

    return attribute, value

def list_data(attribute, value):
    """Split a comma-separated tag value into its individual items.

    Only the "postal_codes" attribute is handled; the function could be
    extended for other keys that carry a list. Whitespace around each item
    is stripped and empty items are dropped, so "10115, 10117" yields two
    clean 5-character codes rather than "10115" and " 10117".

    Args:
        attribute: the raw tag key.
        value: the raw tag value.

    Returns:
        A list of the individual items; empty for any other attribute.
    """
    if attribute != "postal_codes":
        return []

    items = (item.strip() for item in value.split(','))
    return [item for item in items if item]

def dict_data(attribute, value):
    """Return dict data for a complex tag type.

    Currently the tag is returned as a single-entry dict mapping attribute
    to value. An earlier idea, splitting a street value into street, house
    number and city, was scrapped; its dead code has been removed.

    Args:
        attribute: the corrected tag key.
        value: the raw tag value.

    Returns:
        {attribute: value}
    """
    return {attribute: value}
    
def _shape_tags(element, element_id, dict_types):
    """Build the cleaned secondary-tag rows shared by nodes and ways.

    Returns a list of {'id', 'key', 'value', 'type'} dicts, or None as soon
    as cleanups() rejects a value, which means the whole element is dropped.
    """
    rows = []
    for tag in element.findall("tag"):
        raw_key = tag.attrib['k']

        corrected = return_corrected_and_type(raw_key)
        if corrected is None:
            # The key is invalid: skip this tag only, keep the element.
            continue
        key, tag_type = corrected
        raw_value = tag.attrib['v']

        if raw_key in KNOWNLISTTYPES:
            # A known list type expands into one row per list item.
            candidates = [(key, item) for item in list_data(raw_key, raw_value)]
        elif key in dict_types:
            # Same idea for dict types: one row per dict entry.
            candidates = list(dict_data(key, raw_value).items())
        else:
            candidates = [(key, raw_value)]

        for candidate_key, candidate_value in candidates:
            cleaned = cleanups(candidate_key, candidate_value)
            if cleaned is None:
                # A rejected value dismisses the element, whichever branch
                # produced it (previously the list/dict branches failed with
                # a TypeError while unpacking None).
                return None
            clean_key, clean_value = cleaned
            rows.append({'id': element_id, 'key': clean_key,
                         'value': clean_value, 'type': tag_type})
    return rows


# main function returns cleaned/wrangled dict of node/node tags or way/way tags
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: parsed XML element; only 'node' and 'way' tags are shaped.
        node_attr_fields: attribute names copied from a node; a missing
            attribute is stored as ''.
        way_attr_fields: attribute names copied from a way; a missing
            attribute (some early data lacks user attributes) is stored as ''.
        problem_chars: kept for interface compatibility; not used here,
            key validation happens in return_corrected_and_type().
        default_tag_type: kept for interface compatibility; not used here.

    Returns:
        For a node: {'node': attribs, 'node_tags': [tag rows]}.
        For a way: {'way': attribs, 'way_nodes': [nd rows], 'way_tags': [tag rows]}.
        None when cleanups() rejects one of the element's tag values, or
        when the element is neither a node nor a way.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib.get(field, '')
                        for field in node_attr_fields}
        tags = _shape_tags(element, element.attrib.get('id', ''), KNOWN_NODEDICTTYPES)
        if tags is None:
            return None
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib.get(field, '')
                       for field in way_attr_fields}
        way_id = element.attrib.get('id', '')

        # One row per referenced node, numbered from 0 in document order.
        way_nodes = [{'id': way_id, 'node_id': nd.attrib['ref'], 'position': position}
                     for position, nd in enumerate(element.findall("nd"))]

        tags = _shape_tags(element, way_id, KNOWN_WAYDICTTYPES)
        if tags is None:
            return None
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


# ================================================== #
#               Helper Functions                     #
# ================================================== #
## FROM UDACITY CASE STUDY
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate the elements of osm_file whose tag appears in tags.

    The first parser event supplies the root element, which is cleared
    after every yielded element.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    root = next(events)[1]
    for phase, current in events:
        is_wanted = phase == 'end' and current.tag in tags
        if is_wanted:
            yield current
            root.clear()


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that accepts unicode cell values."""

    def writerow(self, row):
        # unicode cells are encoded to UTF-8; every other value passes
        # through untouched.
        encoded_row = {}
        for field, cell in row.iteritems():
            if isinstance(cell, unicode):
                cell = cell.encode('utf-8')
            encoded_row[field] = cell
        super(UnicodeDictWriter, self).writerow(encoded_row)

    def writerows(self, rows):
        # Route every row through writerow() so each one gets encoded.
        for record in rows:
            self.writerow(record)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)

    Every node and way from file_in is shaped with shape_element(); elements
    for which it returns a falsy result are skipped. Nodes are written to
    NODES_PATH / NODE_TAGS_PATH, ways to WAYS_PATH / WAY_NODES_PATH /
    WAY_TAGS_PATH, each file starting with a header row.

    Args:
        file_in: the OSM XML source handed to get_element().
        validate: when exactly True, every shaped element is passed to
            validate_element(el, validator) before it is written.

    Raises:
        NameError: validate is True but validate_element or validator is not
            defined. Neither is defined in the visible part of this notebook,
            so the check runs before the output files are opened (and
            truncated) rather than failing on the first element.
    """
    if validate is True:
        missing = [name for name in ('validate_element', 'validator')
                   if name not in globals()]
        if missing:
            raise NameError("validate=True requires %s to be defined"
                            % " and ".join(missing))

    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
        codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
        codecs.open(WAYS_PATH, 'wb') as ways_file, \
        codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
        codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if not el:
                # Dismissed by the cleanup rules: nothing to write.
                continue

            if validate is True:
                validate_element(el, validator)

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Validation is roughly 10X slower, so it stays switched off here; when
    # validating for the project, point the run at a small sample of the map.
    process_map(file_in=OSM_PATH, validate=False)


In [34]:
## SQL IMPORT STEPS
import csv
import sqlite3
import sys

DB_PATH = DIRECTORY + "berlin_germany.db"
SCHEMA = DIRECTORY + "schema.sql"

# Connection and cursor used for the schema script and the CSV imports
db = sqlite3.connect(DB_PATH)
c = db.cursor()

## The schema.sql script was updated to include its own DROP statements
with open(SCHEMA, "rb") as schema_file:
    c.executescript(schema_file.read())
    


In [35]:
def _import_csv(cursor, csv_path, table, fields, text_fields=()):
    """Insert every row of one CSV file into one table.

    Args:
        cursor: database cursor used for the inserts.
        csv_path: path of the CSV file (with a header row) to read.
        table: name of the target table.
        fields: CSV column names, in the order of the table's columns.
        text_fields: columns whose raw value is decoded from UTF-8 to
            unicode before the insert.

    Returns:
        True when every row was inserted. False when a row failed: the row
        and the error are printed and the rest of the file is skipped.
    """
    placeholders = ", ".join("?" * len(fields))
    insert_sql = "insert into %s values (%s)" % (table, placeholders)

    with open(csv_path, "r") as csv_file:
        for row in csv.DictReader(csv_file):
            try:
                for field in text_fields:
                    row[field] = unicode(row[field], 'utf-8')
                cursor.execute(insert_sql, tuple(row[field] for field in fields))
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are no longer swallowed. The printed output
                # is the same as before.
                print(row)
                print("Unexpected error: %s" % (sys.exc_info(),))
                return False
    return True


# Import nodes, nodes tags, ways, ways tags and ways_nodes paths, committing
# after each file (also after a file that stopped on an error).
for csv_path, table, fields, text_fields in (
        (NODES_PATH, "nodes", NODE_FIELDS, ("user",)),
        (NODE_TAGS_PATH, "nodes_tags", NODE_TAGS_FIELDS, ("key", "value", "type")),
        (WAYS_PATH, "ways", WAY_FIELDS, ("user",)),
        (WAY_TAGS_PATH, "ways_tags", WAY_TAGS_FIELDS, ("key", "value", "type")),
        (WAY_NODES_PATH, "ways_nodes", WAY_NODES_FIELDS, ()),
):
    _import_csv(c, csv_path, table, fields, text_fields)
    db.commit()