# Data Wrangling with MongoDB

### This gives us some general details on the Mobile, Alabama dataset

In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Iterative parsing to process the map file and find out tags and counts."""
import xml.etree.ElementTree as ET
import pprint

# Path of the OSM XML extract whose element tags are counted in this cell.
filename='mobile_alabama.osm'

def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    The file is streamed with ``ET.iterparse`` so it never has to be loaded
    as one string; each element is discarded as soon as it has been counted.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        dict mapping each tag name to the number of elements carrying it.
    """
    tags = {}
    for _, elem in ET.iterparse(filename):
        # dict.get keeps the lookup a single hash probe; `tag in tags.keys()`
        # would build and scan a list for every element on Python 2.
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed, so drop the finished element's
        # attributes and children to keep memory flat on large extracts.
        elem.clear()
    return tags

# Tally every element tag in the extract and pretty-print the counts.
tags = count_tags(filename)
pprint.pprint(tags)   

{'bounds': 1,
 'member': 1461,
 'nd': 287739,
 'node': 239873,
 'osm': 1,
 'relation': 117,
 'tag': 256925,
 'way': 32687}


In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
""""k" value for each "<tag>" and see if they can be valid keys in MongoDB check for other potential problems."""
# Path of the OSM XML extract whose tag keys are classified in this cell.
filename='mobile_alabama.osm'

# Keys made up only of lowercase ASCII letters and underscores (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs separated by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any one of these characters: = + / & < > ; ' " ? % # $ @ , . space, tab, CR or LF.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    The key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first pattern that matches decides the bucket, and a
    key matching none of them is counted under 'other'. Elements whose tag
    is not "tag" leave the counters untouched.

    Args:
        element: an ElementTree element.
        keys: dict with the counters 'lower', 'lower_colon', 'problemchars'
            and 'other'; it is updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key_name = element.attrib['k']
    if lower.search(key_name):
        bucket = 'lower'
    elif lower_colon.search(key_name):
        bucket = 'lower_colon'
    elif problemchars.search(key_name):
        bucket = 'problemchars'
    else:
        bucket = 'other'

    keys[bucket] += 1
    return keys

def process_map(filename):
    """Run key_type() over every element of an OSM file.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        The counter dict ('lower', 'lower_colon', 'problemchars', 'other')
        as returned by the last key_type() call.
    """
    keys = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        keys = key_type(elem, keys)
    return keys

# Classify every tag key in the extract and show the four counters.
keys = process_map(filename)
pprint.pprint(keys)

{'lower': 69399, 'lower_colon': 183560, 'other': 3966, 'problemchars': 0}


This tells us that no problem characters were found in this dataset, so it can be imported into MongoDB.

In [36]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
"""Unique users have contributed to the map in this particular area!"""
# Path of the OSM XML extract scanned for contributor names in this cell.
filename='mobile_alabama.osm'

def get_user(element):
    """Return the element's "user" attribute; raises KeyError when it has none."""
    return element.attrib['user']

def process_map(filename):
    """Collect the distinct "user" values found on the elements of an OSM file.

    Elements for which get_user() raises KeyError (no "user" attribute) are
    skipped.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        set of user names.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        try:
            contributor = get_user(element)
        except KeyError:
            # This element carries no "user" attribute.
            continue
        users.add(contributor)
    return users

# Gather the distinct contributor names and report how many there are.
users = process_map(filename)
print "Count of users: ", len(users)
# The full set is dumped twice: once with the print statement, once via pprint.
print users
pprint.pprint(users)



Count of users:  156
set(['Andre68', 'Matthias Buchmeier', "Mike O'Risey", 'amillar', 'Sundance', 'Brian@Brea', 'Clorox', 'adraladmin', 'hfyu', 'OSMF Redaction Account', 'Dutchie-in-Mobile', 'Mark Gray', 'California Bear', 'xybot', 'Chris Lawrence', 'Oberaffe', 'stucki1', 'robhedrick', 'Jochen Hein', 'Shanon', 'cgu66', 'T Hoffmann', 'andygol', 'BCNorwich', 'dbmercer', 'zephyr', 'Kirbert', 'pschonmann', 'ToffeHoff', 'maxerickson', 'skquinn', 'anbr', '25or6to4', 'RoadGeek_MD99', 'moosejaw', 'werner2101', 'jonesydesign', 'Skywave', 'andrewpmk', 'dcat', 'BugBuster', 'dannykath', 'Dami_D', 'vitus marton', 'StellanL', 'AndrewSnow', 'Maarten Deen', 'rumpelsocke', 'uk1967', 'LZimlich', 'rickmastfan67', 'Jano John Akim Franke', 'STA', 'Luke S', 'spursmjs777', 'Benny Butler', 'mvexel', 'adjuva', 'iandees', 'WanMil', 'Echo Echo', 'landfahrer', 'oldtopos', 'wambacher', 'Dimitrii', 'daviskingdom', 'isabellekh', 'Jonathan ZHAO', 'smsm1', 'Claudius Henrichs', 'ToeBee', 'oldenburg69', 'tcekolin', 'god

# Auditing the names

In [43]:
"""
- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# OSM XML extract audited by this cell.
OSMFILE = "mobile_alabama.osm"
# Matches the last whitespace-free token of a street name, e.g. "Ave" in
# "Holcombe Ave"; that token is treated as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that audit_street_type() does not report. It is empty here, so
# every street type found is reported; the fuller list is kept commented out below.
expected = []
#expected=["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", "Trail", "Parkway", "Commons"]

# Abbreviated street type -> full spelling; test() passes this to update_name().
mapping = { "Ave": "Avenue",
            "Blvd": "Boulevard",
            "Dr": "Drive",
            "Rd": "Road",
            }


def audit_street_type(street_types, street_name):
    """Record a street name under its street type unless that type is expected.

    The street type is whatever ``street_type_re`` matches in the name.
    Names with no match, or whose type is listed in ``expected``, are ignored.

    Args:
        street_types: mapping of street type -> set of names (a
            ``defaultdict(set)`` in this notebook); updated in place.
        street_name: the street name to inspect.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return

    suffix = match.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether the element's "k" attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Group the "addr:street" values of an OSM file by street type.

    Every <tag> child of each "node" and "way" element is checked with
    is_street_name(); matching values are handed to audit_street_type().
    The collected mapping is pretty-printed and also returned.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        defaultdict(set) mapping street type -> set of street names, so that
        callers such as test() can iterate over the result.
    """
    street_types = defaultdict(set)
    # The with-block closes the handle even if parsing fails. Binary mode
    # leaves decoding to the XML parser, which honours the file's own
    # encoding declaration.
    with open(osmfile, "rb") as osm_file:
        # Iterate on the default "end" events: at "start" an element's
        # children are not guaranteed to be parsed yet, so its <tag>
        # children could be missed.
        for _, elem in ET.iterparse(osm_file):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    pprint.pprint(dict(street_types))
    return street_types
    

def update_name(name, mapping):
    """Replace an abbreviated street type at the end of a street name.

    The street type is the text matched by ``street_type_re``. When it is a
    key of ``mapping`` it is swapped for the mapped spelling; otherwise the
    name is returned unchanged. This covers names with no street type at
    all and types that are already spelled out, such as "Avenue".

    Args:
        name: the street name to fix.
        mapping: dict of abbreviated street type -> full spelling.

    Returns:
        The corrected street name, or ``name`` itself when nothing applies.
    """
    match = street_type_re.search(name)
    # Test the match object, not the name: a non-empty name may still have no match.
    if match is None:
        return name

    street_type = match.group()
    if street_type not in mapping:
        # Unknown or already-correct street type: leave it alone instead of raising KeyError.
        return name

    # Splice by position so the replacement text is inserted literally
    # (re.sub would interpret backslashes in it).
    return name[:match.start()] + mapping[street_type] + name[match.end():]


def test():
    """Audit OSMFILE, print the result and show how update_name() rewrites each name.

    Relies on audit() returning the street-type -> names mapping; every
    collected name is passed to update_name() with the module-level `mapping`.
    """
    st_types = audit(OSMFILE)
    print st_types
#    pprint.pprint(dict(st_types))

    # Print "original => corrected" for every street name found by the audit.
    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name


# Audit the street names in OSMFILE; audit() pretty-prints what it finds.
audit(OSMFILE)

{'Ave': set(['Holcombe Ave', 'S Mobile Ave']),
 'Avenue': set(['North Washington Avenue', 'Spring Hill Avenue']),
 'Blvd': set(['Airport Blvd']),
 'Boulevard': set(['Airport Boulevard', 'Eastern Shore Boulevard']),
 'Court': set(['Green Court', 'Southern Way Court']),
 'Dr': set(['Grishilde Dr', 'Yacht Club Dr']),
 'Drive': set(['Bass Pro Drive',
               'Dunlap Drive',
               'Gaillard Drive',
               'Golf Way Drive',
               'Museum Drive']),
 'Highway': set(['North Craft Highway']),
 'Laurel': set(['Laurel']),
 'Rd': set(['Old Shell Rd']),
 'Road': set(['Addsco Road',
              'Cody Road',
              'Howells Ferry Road',
              'North Beach Road',
              'Old Shell Road']),
 'South': set(['Schillinger Road South']),
 'Street': set(['Dauphin Street',
                'Government Street',
                'Saint Francis Street',
                'South Broad Street',
                'South Claiborne Street']),
 'Trail': set(['Old Spani

# XML to JSON for MongoDB

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json


# Keys made up only of lowercase ASCII letters and underscores (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs separated by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any one of these characters: = + / & < > ; ' " ? % # $ @ , . space, tab, CR or LF.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element() nests under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Convert a top-level OSM "node" or "way" element into a JSON-ready dict.

    Layout of the returned document:
      - "type": the element's tag name ("node" or "way").
      - "created": sub-dict holding the attributes listed in CREATED.
      - "pos": [lat, lon] as floats when the element carries a "lat" or
        "lon" attribute (a missing coordinate stays at 0.0).
      - every other attribute is copied to the top level unchanged.
      - <tag> children:
          * a "k" containing any problem character is ignored;
          * "addr:<x>" is stored as document["address"][<x>], unless <x>
            contains a further ":" (e.g. "addr:street:name"), in which case
            the tag is ignored;
          * every other tag is stored at the top level under its "k".
      - <nd> children: their "ref" values, in document order, under
        "node_refs" (only present when there is at least one).

    Args:
        element: an ElementTree element.

    Returns:
        dict for "node" and "way" elements, None for any other element.
    """
    # Process only 2 types of top level tags: "node" and "way"
    if element.tag not in ("node", "way"):
        return None

    node = {"type": element.tag}

    for key, value in element.attrib.items():
        if key in CREATED:
            node.setdefault("created", {})[key] = value
        elif key in ("lat", "lon"):
            # "pos" is [lat, lon]; values are floats, not strings.
            pos = node.setdefault("pos", [0.0, 0.0])
            pos[0 if key == "lat" else 1] = float(value)
        else:
            node[key] = value

    # Tags are handled once per element, not once per attribute.
    for tag in element.iter("tag"):
        tkey = tag.attrib['k']
        tval = tag.attrib['v']

        # search(), not match(): a problem character anywhere in the key
        # (not just in first position) disqualifies the tag.
        if problemchars.search(tkey):
            continue

        if tkey.startswith("addr:"):
            akey = tkey[len("addr:"):]
            # A second ":" separates the type/direction of a street; skip those.
            if ":" in akey:
                continue
            # Create "address" only when something is actually stored in it.
            node.setdefault("address", {})[akey] = tval
        else:
            # Keys with a single non-"addr" colon are kept like any other tag.
            node[tkey] = tval

    refs = [nd.attrib["ref"] for nd in element.iter("nd")]
    if refs:
        node["node_refs"] = refs

    return node


def process_map(file_in, pretty = False):
    """Shape every element of an OSM file and write the results as JSON lines.

    Each element is passed to shape_element(); truthy results are appended to
    the returned list and written, one JSON document followed by a newline,
    to "<file_in>.json".

    Args:
        file_in: path of the OSM XML file; also the stem of the output name.
        pretty: when true, documents are written with a 2-space indent.

    Returns:
        list of the shaped documents, in file order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as out_file:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            out_file.write(json.dumps(shaped, indent=indent) + "\n")
    return data

def test():
    """Convert 'mobile_alabama.osm' with process_map() and check the result.

    The whole converted list is pretty-printed, then the first document and
    the "address" / "node_refs" of the last document are compared with
    hard-coded expectations.
    """
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map('mobile_alabama.osm', False)
    pprint.pprint(data)
    
    # NOTE(review): these expected values (e.g. pos near 41.97, -87.69) do not look
    # like they belong to mobile_alabama.osm. The output recorded under this cell
    # starts with id '51845580' at [30.544867, -87.897021], so the asserts below
    # presumably fail on this file. Confirm and refresh them.
    correct_first_elem = {
        "id": "261114295", 
        "visible": "true", 
        "type": "node", 
        "pos": [41.9730791, -87.6866303], 
        "created": {
            "changeset": "11129782", 
            "user": "bbmiller", 
            "version": "7", 
            "uid": "451048", 
            "timestamp": "2012-03-28T18:31:23Z"
        }
    }
    assert data[0] == correct_first_elem
    assert data[-1]["address"] == {
                                    "street": "West Lexington St.", 
                                    "housenumber": "1412"
                                      }
    assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                    "2199822370", "2199822284", "2199822281"]

# Run the conversion and its checks when this code executes as the main module.
if __name__ == "__main__":
    test()

[{'created': {'changeset': '2196450',
              'timestamp': '2009-08-19T03:00:21Z',
              'uid': '147510',
              'user': 'woodpeck_fixbot',
              'version': '2'},
  'id': '51845580',
  'pos': [30.544867, -87.897021],
  'type': 'node'},
 {'created': {'changeset': '2935812',
              'timestamp': '2009-10-24T10:23:51Z',
              'uid': '147510',
              'user': 'woodpeck_fixbot',
              'version': '2'},
  'id': '51845584',
  'pos': [30.545583, -87.895362],
  'type': 'node'},
 {'created': {'changeset': '3095171',
              'timestamp': '2009-11-12T01:59:29Z',
              'uid': '147510',
              'user': 'woodpeck_fixbot',
              'version': '2'},
  'id': '51845586',
  'pos': [30.545559, -87.895275],
  'type': 'node'},
 {'created': {'changeset': '2788963',
              'timestamp': '2009-10-09T00:33:36Z',
              'uid': '147510',
              'user': 'woodpeck_fixbot',
              'version': '2'},
  'id': '5184

In [None]:
from pymongo import MongoClient
import pprint

# Client for the MongoDB server at localhost:27017; `db` is its "openstreetmap" database.
client=MongoClient('mongodb://localhost:27017/')
db=client.openstreetmap

def find():
    """Pretty-print every document of db.mobile whose "type" field is "way"."""
    way_filter = {"type": "way"}
    for document in db.mobile.find(way_filter):
        pprint.pprint(document)
            
# Print all "way" documents stored in the mobile collection.
find()