# 1. Data Audit

In [1]:
# %%writefile mapparser.py
#!/usr/bin/env python

import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections

In [2]:
import os
datadir = "."
datafile = "DenverCO.osm"
denver_data = os.path.join(datadir, datafile)

OSM_FILE = "./DenverCO.osm"
SAMPLE_FILE = "DenverCO_sample.osm"

In [3]:
#Parse through the file with ElementTree and count the number of unique element types to understand overall structure.
def count_tags(filename):
        tags = {}
        print filename
        
        for event, elem in ET.iterparse(filename):
            
            if elem.tag in tags:
                tags[elem.tag] += 1
            else:
                tags[elem.tag] = 1
        return tags
    
tags = count_tags(SAMPLE_FILE)
pprint.pprint(tags)

DenverCO_sample.osm
{'member': 2117,
 'nd': 240131,
 'node': 200867,
 'osm': 1,
 'relation': 119,
 'tag': 132958,
 'way': 26565}


In [4]:
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def get_element(osm_file, tags=('node','way')):
     context = ET.iterparse(osm_file, events=('start','end'))
     _, root = next(context) 
     for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

def key_type(element, keys):
    if element.tag == "node" or element.tag == "way":
        for tag in element.iter('tag'):
            k = tag.get('k')
            if lower.search(k):
                keys['lower'] += 1
            elif lower_colon.search(k):
                keys['lower_colon'] += 1
            elif problemchars.search(k):
                keys['problemchars'] += 1
            else:
                keys['other'] += 1
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in enumerate(get_element(filename)):
        keys = key_type(element, keys)

    return keys

denver_keys = process_map(SAMPLE_FILE)
pprint.pprint(denver_keys)

{'lower': 83849, 'lower_colon': 46752, 'other': 1930, 'problemchars': 0}


In [5]:
#people invovlved in the map editing.
def process_map(filename):
    users = set()
    for __, element in ET.iterparse(filename):
        for e in element:
            if 'uid' in e.attrib:
                users.add(e.attrib['uid'])
                
    return users

users = process_map(SAMPLE_FILE)
len(users)

768

# 2. Problems Encountered

   # 2.1. Street Abbreviations

In [6]:
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Avenue", "Boulevard", "Commons", "Court", "Circle", "Drive", "Lane", "Parkway", 
                         "Place", "Road", "Square", "Street", "Trail", "Way"]

mapping = {'Ave'  : 'Avenue',
           'Ave.' : 'Avenue',
           'Blvd' : 'Boulevard',
           'Cir'   : 'Circle',
           'Dr'   : 'Drive',
           'Ln'   : 'Lane',
           'Pkwy' : 'Parkway',
           'Pl'   : 'Place',
           'Rd'   : 'Road',
           'Rd.'   : 'Road',
           'St'   : 'Street',
           'St.'  : 'Street',
           'street' : 'Street',
           'Ct'   : 'Court',
           'Cir'  : 'Circle',
           'Cr'   : 'Court',
           'ave'  : 'Avenue',
           'Hwg'  : 'Highway',
           'Hwy'  : 'Highway',
           'Wy'   : 'Way',
           'Sq'   : "Square"}

In [7]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for elem in get_element(osm_file, tags=('node','way')):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types

In [8]:
denver_street_types = audit(SAMPLE_FILE)

In [9]:
pprint.pprint(dict(denver_street_types))

{'100': set(['Sheridan Boulevard #100']),
 '2': set(['Colorado SH 2', 'Colorado SR 2']),
 '287': set(['US Highway 287']),
 'Ave': set(['E 6th Ave',
             'E 72nd Ave',
             'E Caley Ave',
             'E Fair Ave',
             'E Maplewood Ave',
             'W 84th Ave']),
 'Ave.': set(['W. Alameda Ave.']),
 'Blvd': set(['745 Colorado Blvd',
              'East Academy Blvd',
              'Federal Blvd',
              'Green Valley Ranch Blvd',
              'S University Blvd',
              'South Colorado Blvd',
              'Wadsworth Blvd',
              'Wadworth Blvd']),
 'Broadway': set(['Broadway', 'South Broadway']),
 'Center': set(['Garden Center']),
 'Cir': set(['E Flatiron Cir', 'S Lake Cir']),
 'Colfax': set(['East Colfax']),
 'Crescent': set(['Interlocken Crescent']),
 'Ct': set(['S Niagra Ct']),
 'Dr': set(['Community Circle Dr']),
 'Highway': set(['North Valley Highway']),
 'Lincoln': set(['Lincoln']),
 'Pl': set(['E Maplewood Pl', 'E Orchard Pl']),


In [10]:
def update_name(name, mapping, regex):
    m = regex.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = re.sub(regex, mapping[street_type], name)

    return name

for street_type, ways in denver_street_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping, street_type_re)
        print name, "=>", better_name

East Colfax => East Colfax
Park Avenue West => Park Avenue West
Pearl St. => Pearl Street
Via Varra => Via Varra
E Arapahoe Rd => E Arapahoe Road
Coalton Rd => Coalton Road
S Parker Rd => S Parker Road
North Valley Highway => North Valley Highway
Colorado SH 2 => Colorado SH 2
Colorado SR 2 => Colorado SR 2
E Orchard Pl => E Orchard Place
E Maplewood Pl => E Maplewood Place
Community Circle Dr => Community Circle Drive
US Highway 287 => US Highway 287
Lincoln => Lincoln
W. Alameda Ave. => W. Alameda Avenue
Garden Center => Garden Center
Tennyson => Tennyson
S Sherman St => S Sherman Street
S Clayton St => S Clayton Street
Wright St => Wright Street
S Poplar St => S Poplar Street
South Grant St => South Grant Street
Dayton St => Dayton Street
Wewatta St => Wewatta Street
S Newport St => S Newport Street
S Elizabeth St => S Elizabeth Street
S Peoria St => S Peoria Street
S Broadway St => S Broadway Street
S Josephine St => S Josephine Street
S Columbine St => S Columbine Street
E Flatiro

   # 2.2. Zip Code

In [11]:
from collections import defaultdict

def audit_zipcode(invalid_zipcodes, zipcode):
    twoDigits = zipcode[0:2]
    
    if not twoDigits.isdigit():
        invalid_zipcodes[twoDigits].add(zipcode)
    
    elif twoDigits != 80:
        invalid_zipcodes[twoDigits].add(zipcode)
        
def is_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_zip(osmfile):
    osm_file = open(osmfile, "r")
    invalid_zipcodes = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zipcode(tag):
                    audit_zipcode(invalid_zipcodes,tag.attrib['v'])

    return invalid_zipcodes

denver_zipcode = audit_zip(SAMPLE_FILE)

In [12]:
pprint.pprint(dict(denver_zipcode))

{'80': set(['80002',
            '80003',
            '80004',
            '80010',
            '80011',
            '80012',
            '80013',
            '80014',
            '80015',
            '80016',
            '80017',
            '80018',
            '80020',
            '80021',
            '80022',
            '80023',
            '80027',
            '80030',
            '80031',
            '80033',
            '80045',
            '80110',
            '80111',
            '801111',
            '80112',
            '80113',
            '80120',
            '80121',
            '80122',
            '80123',
            '80127',
            '80202',
            '80203',
            '80204',
            '80205',
            '80206',
            '80207',
            '80209',
            '80210',
            '80211',
            '80212',
            '80214',
            '80214-1801',
            '80214-1833',
            '80215',
            '80216',
            '80218',
  

In [13]:
def update_name(zipcode):
    testNum = re.findall('[a-zA-Z]*', zipcode)
    if testNum:
        testNum = testNum[0]
    testNum.strip()
    if testNum == "CO":
        convertedZipcode = (re.findall(r'\d+', zipcode))
        if convertedZipcode:
            if convertedZipcode.__len__() == 2:
                return (re.findall(r'\d+', zipcode))[0] + "-" +(re.findall(r'\d+', zipcode))[1]
            else:
                return (re.findall(r'\d+', zipcode))[0]

for street_type, ways in denver_zipcode.iteritems():
    for name in ways:
        better_name = update_name(name)
        print name, "=>", better_name

80214-1801 => None
80122 => None
801111 => None
80241 => None
80238 => None
80640 => None
80002 => None
80601 => None
80221 => None
80220 => None
80223 => None
80222 => None
80045 => None
80224 => None
80209 => None
80226 => None
80207 => None
80206 => None
80205 => None
80204 => None
80203 => None
80202 => None
80015 => None
80249 => None
80127 => None
80023 => None
80022 => None
80021 => None
80020 => None
80004 => None
80260 => None
80003 => None
80121 => None
80232 => None
80123 => None
80031 => None
80214-1833 => None
80247 => None
80113 => None
80219 => None
80017 => None
80111 => None
80027 => None
80218 => None
80233 => None
80230 => None
80231 => None
80030 => None
80237 => None
80234 => None
80235 => None
80210 => None
80211 => None
80212 => None
80239 => None
80214 => None
80215 => None
80216 => None
80246 => None
80229 => None
80033 => None
80120 => None
80018 => None
80602 => None
80016 => None
80112 => None
80014 => None
80110 => None
80012 => None
80013 => None
80010 => 

# 3. Convert from XML to JSON

In [14]:
import re
import codecs
import json

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
address_regex = re.compile(r'^addr\:')
street_regex = re.compile(r'^street')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type'] = element.tag
        # initialize empty address
        address = {}
        # parsing through attributes
        for a in element.attrib:
            if a in CREATED:
                if 'created' not in node:
                    node['created'] = {}
                node['created'][a] = element.get(a)
            elif a in ['lat', 'lon']:
                continue
            else:
                node[a] = element.get(a)
        # populate position
        if 'lat' in element.attrib and 'lon' in element.attrib:
            node['pos'] = [float(element.get('lat')), float(element.get('lon'))]

        # parse second-level tags for nodes
        for e in element:
            # parse second-level tags for ways and populate `node_refs`
            if e.tag == 'nd':
                if 'node_refs' not in node:
                    node['node_refs'] = []
                if 'ref' in e.attrib:
                    node['node_refs'].append(e.get('ref'))

            # throw out not-tag elements and elements without `k` or `v`
            if e.tag != 'tag' or 'k' not in e.attrib or 'v' not in e.attrib:
                continue
            key = e.get('k')
            val = e.get('v')

            # skip problematic characters
            if problemchars.search(key):
                continue

            # parse address k-v pairs
            elif address_regex.search(key):
                key = key.replace('addr:', '')
                address[key] = val

            # catch-all
            else:
                node[key] = val
        # compile address
        if len(address) > 0:
            node['address'] = {}
            street_full = None
            street_dict = {}
            street_format = ['prefix', 'name', 'type']
            # parse through address objects
            for key in address:
                val = address[key]
                if street_regex.search(key):
                    if key == 'street':
                        street_full = val
                    elif 'street:' in key:
                        street_dict[key.replace('street:', '')] = val
                else:
                    node['address'][key] = val
            # assign street_full or fallback to compile street dict
            if street_full:
                node['address']['street'] = street_full
            elif len(street_dict) > 0:
                node['address']['street'] = ' '.join([street_dict[key] for key in street_format])
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data
process_map(SAMPLE_FILE)

[{'created': {'changeset': '28505098',
   'timestamp': '2015-01-30T13:47:30Z',
   'uid': '2237750',
   'user': 'chachafish',
   'version': '4'},
  'id': '26251251',
  'pos': [39.9001128, -105.0807594],
  'type': 'node'},
 {'created': {'changeset': '28505237',
   'timestamp': '2015-01-30T13:50:58Z',
   'uid': '2237750',
   'user': 'chachafish',
   'version': '2'},
  'id': '26251287',
  'pos': [39.8794519, -105.0663903],
  'type': 'node'},
 {'created': {'changeset': '4641999',
   'timestamp': '2010-05-08T19:17:37Z',
   'uid': '139555',
   'user': 'DavidJDBA',
   'version': '4'},
  'id': '26280490',
  'pos': [39.8235601, -104.7459713],
  'type': 'node'},
 {'created': {'changeset': '29868003',
   'timestamp': '2015-03-31T06:07:07Z',
   'uid': '227972',
   'user': 'Your Village Maps',
   'version': '6'},
  'id': '27512380',
  'pos': [39.72954, -105.0151797],
  'type': 'node'},
 {'created': {'changeset': '30177046',
   'timestamp': '2015-04-13T02:39:30Z',
   'uid': '227972',
   'user': 'Your

# 4. Dataoverview with MongoDB

In [3]:
#!pip install pymongo
import signal
import subprocess
pro = subprocess.Popen('mongod', preexec_fn = os.setsid)

NameError: name 'os' is not defined