# OpenStreetMap Sample Project

# Data Wrangling with MongoDB

## Lucas Mwai

Map Area: Marietta, GA, United States

https://www.openstreetmap.org/export#map=12/33.9486/-84.5425
Overpass API node(-84.7169,34.0316,-84.3681,33.8656)

This project seeks to apply data Wrangling techniques to analyze and clean Open Street Map data for the selected area and then perform an exploratory analysis of the data after importing it into a MongoDB collection.
The code used in the case study scripts has been adopted here and modified to reflect my own osm file.


In [1]:
#import modules needed
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import os
import pprint
#indicate the file path and open the file
osm_file = open("C:\Users\CHEGE\Desktop\chicago\map.osm", "r")
filename = "map.osm"
path = "C:\Users\CHEGE\Desktop\chicago"
marietta_osm = os.path.join(path, filename)

First i will audit the street types and display the different variations of the names.

In [2]:
#open file  and display the street types
osm_file = open("C:\Users\CHEGE\Desktop\chicago\map.osm", "r")

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()

        street_types[street_type] += 1

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])    
    print_sorted_dict(street_types)    

if __name__ == '__main__':
    audit()

#100: 1
#1012: 1
#108: 1
#125E: 1
#212: 1
#240: 1
#270: 1
#450: 1
#603: 1
#A: 1
#G: 1
4015: 2
41: 1
710: 1
Approach: 19
Avenue: 9
Boulevard: 6
Circle: 159
Close: 8
Connector: 1
Court: 279
Crossing: 50
Cut: 1
Dale: 6
Dr: 1
Drive: 927
Extension: 1
Glen: 12
Highway: 42
Hill: 10
Hollow: 16
Knoll: 18
Landing: 5
Lane: 351
Mall: 3
NE: 19
North: 47
Northeast: 4677
Northwest: 6147
NW: 3
Overlook: 10
Parkway: 124
Path: 14
Pkwy: 2
Place: 162
Rd: 3
Ridge: 1
Road: 690
Run: 31
SE: 2
South: 115
Southeast: 1102
Southwest: 235
Square: 8
Street: 133
Terrace: 18
Trace: 230
Track: 36
Trail: 76
Walk: 26
Way: 257


The variation of street names is displayed above, including  full names, abbreviations, the (#) sign plus numbers and thr (#) sigh and letters. i will standardize the street names and use fullnmaes inplace of abbreviations and remove the characters and numbers on the end to ensure consistency.

In [3]:
#import required modules
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "map.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
#expected street names no change required if street name is in this list 
expected = ["Approach", "Avenue", "Boulevard","Circle","Close","Connector","Court","Crossing","Dale", 
            "Drive", "Extension","Glen","Highway","Hill","Hollow","Knoll","Landing","Lane","Mall",
            "North","Northeast","Northwest","Overlook","Parkway", "Path","Place", "Ridge","Road","Run",
            "South","Southeast","Southwest","Square","Street","Terrace","Trace", "Track", 
            "Trail", "Walk", "Way"]
#to update the abbreviations with replacements and remove extra characters from street names
mapping = { "Dr": "Drive",
            "NE" : "Northeast",
            "NW": "Northwest",
            "Rd": "Road",
            "Pkwy": "Parkway",
            "SE": "Southeast",
            "Ernest W Barrett Pkwy NW #100": "Ernest W Barrett Parkway Northwest",
            "Williams Dr #1012":"Williams Drive",
            "Williams Drive #108":"Williams Drive",
            "Johnson Ferry Rd #125E":"Johnson Ferry Road",
            "Powers Ferry Rd SE #212":"Powers Ferry Road Southeast",
            "Roswell Rd #240":"Roswell Road",
            "Roswell Rd Northeast":"Roswell Road Northeast",
            "Cumberland Blvd Southeast":"Cumberland Boulevard Southeast",
            "Cobb International Drive #270":"Cobb International Drive",
            "Ernest W Barrett Parkway Suite 4015":"Ernest W Barrett Parkway",
            "Roswell Rd #603": "Roswell Road",
            "Johnson Ferry Rd #450":"Johnson Ferry Road",
            "Dallas Hwy Sw Ste 710":"Dallas Highway Southwest",
            "Powers Ferry Rd SE #A":"Powers Ferry Road",
            "Canton Rd #G":"Canton Road"
          }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping):
     sorted_keys = sorted(mapping.keys(), key=len, reverse=True)
     for abbrv in sorted_keys:
         if(abbrv in name):
             return name.replace(abbrv, mapping[abbrv])
     return name


def test():
    st_types = audit(OSMFILE)
    #assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
      


if __name__ == '__main__':
    test()

{'100': set(['Ernest W Barrett Pkwy NW #100']),
 '1012': set(['Williams Dr #1012']),
 '108': set(['Williams Drive #108']),
 '125E': set(['Johnson Ferry Rd #125E']),
 '212': set(['Powers Ferry Rd SE #212']),
 '240': set(['Roswell Rd #240']),
 '270': set(['Cobb International Drive #270']),
 '4015': set(['Ernest W Barrett Parkway Suite 4015']),
 '41': set(['Old Highway 41']),
 '450': set(['Johnson Ferry Rd #450']),
 '603': set(['Roswell Rd #603']),
 '710': set(['Dallas Hwy Sw Ste 710']),
 'A': set(['Powers Ferry Rd SE #A']),
 'Cut': set(['Intrepid Cut']),
 'Dr': set(['Fambrough Dr']),
 'G': set(['Canton Rd #G']),
 'NE': set(['Roswell Rd NE', 'Roswell Road NE', 'Sandy Plains Ind Pky NE']),
 'NW': set(['Mars Hill Road NW', 'McCollum Parkway NW']),
 'Pkwy': set(['Cimarron Pkwy', 'Earnst W. Barrett Pkwy']),
 'Rd': set(['Roswell Rd', 'Vaughn Rd']),
 'SE': set(['Cumberland Blvd SE', 'Spring Hill Parkway SE'])}
Powers Ferry Rd SE #A => Powers Ferry Road
Cobb International Drive #270 => Cobb Inte

The list of the expected names is in the first dictionary, then the updated elements are in the mapping. The changes in street names is made as shown with the old name and the new updated name.

In [4]:
#preview of street names after the cleaning
st_types = audit(marietta_osm)
for st_type, ways in st_types.iteritems():
    for name in ways:        
        name = update_name(name, mapping)
        print name

Powers Ferry Road
Cobb International Drive
Johnson Ferry Road
Powers Ferry Road Southeast
Canton Road
Roswell Road
Johnson Ferry Road
Sandy Plains Ind Pky Northeast
Roswell Rd Northeast
Roswell Road Northeast
Ernest W Barrett Parkway
Cimarron Parkway
Earnst W. Barrett Parkway
Roswell Road
Vaughn Road
Old Highway 41
Williams Drive
Intrepid Cut
Roswell Road
Cumberland Blvd Southeast
Spring Hill Parkway Southeast
Ernest W Barrett Parkway Northwest
Dallas Highway Southwest
Fambrough Drive
Williams Drive
McCollum Parkway Northwest
Mars Hill Road Northwest


A display of a  preview of a clean version that will be written to the database

After corrections are made the files are converted to json formart before writing them to the database

In [5]:
#import modules
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
#shape elements

def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
       
        lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = ["version", "changeset", "timestamp", "user", "uid"]
ATTRIB = ["id", "visible", "amenity", "cuisine", "name", "phone"]

def shape_element(element):
    
    if element.tag == 'node' or element.tag == 'way':

        # Add empty created dictionary and k/v = type: node/way
        node = {'created': {}, 'type': element.tag}

        # Update pos array with lat and lon
        if 'lat' in element.attrib and 'lon' in element.attrib:
            node['pos'] = [float(element.attrib['lat']), float(element.attrib['lon'])]

        # Deal with node and way attributes
        for k in element.attrib:

            if k == 'lat' or k == 'lon':
                continue
            if k in CREATED:
                node['created'][k] = element.attrib[k]
            else:
                # Add direct key/value items of node/way
                node[k] = element.attrib[k]

        # Deal with second level tag items
        for tag in element.iter('tag'):
            k = tag.attrib['k']
            v = tag.attrib['v']

            # Search for problem characters in 'k' and ignore them
            if problemchars.search(k):
                # Add to array to print out later
                continue
            elif k.startswith('addr:'):
                address = k.split(':')
                if len(address) == 2:
                    if 'address' not in node:
                        node['address'] = {}
                    node['address'][address[1]] = v
            else:
                node[k] = v

        # Add key/value node ref from way
        node_refs = []
        for nd in element.iter('nd'):
            node_refs.append(nd.attrib['ref'])

        if len(node_refs) > 0:
            node['node_refs'] = node_refs

        return node
    else:
        return None



def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def test():
   
    data = process_map('map.osm', True)
    #pprint.pprint(data)
 

if __name__ == "__main__":
    test()

The cleaned json file if ready to be written to the database