In [1]:
from IPython.display import display_html
# Emit a raw-HTML "Toggle Code" button; its onclick handler toggles the
# visibility of elements matching .input, .prompt, .output_stderr and .output_error.
# NOTE(review): the handler relies on `$` (presumably jQuery) being defined in the page -- confirm.
display_html("""<button onclick="$('.input, .prompt, .output_stderr, .output_error').toggle();">Toggle Code</button>""", raw=True)

# OpenStreetMap
***Data Wrangling with mongoDB by NK Zhehua Zou***
  
Map Area: San Jose, CA, United States  
https://mapzen.com/data/metro-extracts/metro/san-jose_california/  
  
***Table of Contents***
1. Data Audit
2. Problems Encountered in the Map  
Abbreviated Street Names  
Postal Codes  
3. Data Overview  
4. Additional Ideas  
Contributor statistics and gamification suggestion  
Additional data exploration using MongoDB  
5. Conclusion

# 1. Data Audit

In [2]:
# Load packages and libraries
import sys
sys.path.append("script/")
import xml.etree.cElementTree as ET
import re

### cleaning ###
from collections import defaultdict
import string

### osm to json ###

from pymongo import MongoClient
import schem
import os
import codecs
import json

In [3]:
# Load data
# This data is just a sample for code testing; the analysis text was written for the original data and has not been changed.
# Please read the HTML file if you want to review the entire analysis.

data = 'data/sample.osm'

### Tags
Parse through the San Jose dataset with ElementTree and count the number of unique element types to get an overall understanding of the data by using count_tags function.

In [4]:
# This function takes 5~10 seconds to run. Please be patient on the next step.
# Parse through the data with ElementTree.
def count_tags(data):
    """Count how often each element tag occurs in an XML source.

    data -- a filename or file object accepted by ET.iterparse.
    Returns a plain dict mapping tag name -> number of occurrences.
    """
    tally = {}
    for _, node in ET.iterparse(data):
        tally[node.tag] = tally.get(node.tag, 0) + 1
    return tally

In [5]:
count_tags(data)

{'bounds': 1,
 'member': 115,
 'nd': 21564,
 'node': 13332,
 'osm': 1,
 'relation': 21,
 'tag': 7128,
 'way': 3721}

### Keys Type
***For the following functions, key_type & process_key, we check the "k" value of each tag:***  
"lower", for tags that contain only lowercase letters and are valid.  
"lower_colon", for otherwise valid tags with a colon in their names.  
"problemchars", for tags with problematic characters.

In [6]:
# Regular expressions used to sort tag keys into categories.
# Keys made only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Keys made of two lowercase/underscore parts separated by a single colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# This regex represents invalid MongoDB characters for keys.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    """Classify the 'k' attribute of a <tag> element and bump the matching counter.

    element -- an ElementTree element; anything whose tag is not 'tag' is ignored.
    keys    -- dict with the counters 'lower', 'lower_colon', 'problemchars'
               and 'other'; it is updated in place.
    Returns the same `keys` dict.
    """
    if element.tag == 'tag':
        k = element.get('k')
        if k is None:
            # A <tag> without a key cannot be classified; count it instead of crashing.
            keys['other'] += 1
        elif lower.match(k):
            keys['lower'] += 1
        elif lower_colon.match(k):
            keys['lower_colon'] += 1
        # search(), not match(): a problematic character can sit anywhere in
        # the key, while match() would only look at the first character.
        elif problemchars.search(k):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

def process_key(data):
    """Run key_type over every element of an XML source and return the counters.

    data -- a filename or file object accepted by ET.iterparse.
    Returns a dict with the counts for 'lower', 'lower_colon',
    'problemchars' and 'other'.
    """
    tally = dict.fromkeys(('lower', 'lower_colon', 'problemchars', 'other'), 0)
    for _event, elem in ET.iterparse(data):
        tally = key_type(elem, tally)
    return tally

In [7]:
process_key(data)

{'lower': 6751, 'lower_colon': 363, 'other': 14, 'problemchars': 0}

### Users

In [8]:
def process_people(data):
    """Collect the distinct 'uid' values carried by child elements.

    For every element yielded by ET.iterparse, its direct children are
    inspected and each child's 'uid' attribute (when present) is recorded.

    data -- a filename or file object accepted by ET.iterparse.
    Returns a set of uid strings.
    """
    return {
        child.attrib['uid']
        for _, parent in ET.iterparse(data)
        for child in parent
        if 'uid' in child.attrib
    }

print str(len(process_people(data))) + ' peoples invovlved in the map editing.'

94 peoples invovlved in the map editing.


# 2. Problems Encountered in the Map
After initially downloading a small sample size of the San Jose area and running it, I noticed two main problems with the data, which I will discuss in the following order:  
1) Abbreviated street names ('Branham Ln')  
2) Inconsistent postal codes ('CA950543', '95014-1899')

### Abbreviated Street Names
Once the data was imported to MongoDB, some basic querying revealed street name abbreviations. I updated all substrings in problematic address strings, such that 'Branham Ln' becomes 'Branham Lane'.

In [9]:
# Matches the trailing run of non-whitespace characters of a street name
# (where the street type usually sits).
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Matches a single leading letter, optionally followed by a dot, then whitespace.
street_abbrev_re = re.compile(r'^([a-z]){1}\.?(\s)+', re.IGNORECASE)
# Street types that are already in their full form and need no cleaning.
expected = ['Avenue', 'Boulevard', 'Commons', 'Court', 'Drive', 'Lane', 'Parkway', 
                         'Place', 'Road', 'Square', 'Street', 'Trail']
# Abbreviated / mis-cased street type -> full street type.
mapping_street = {'Ave'  : 'Avenue',
           'Blvd' : 'Boulevard',
           'Dr'   : 'Drive',
           'Ln'   : 'Lane',
           'Pkwy' : 'Parkway',
           'Rd'   : 'Road',
           'Rd.'   : 'Road',
           'St'   : 'Street',
           'street' :'Street',
           'Ct'   : 'Court',
           'Cir'  : 'Circle',
           'Cr'   : 'Court',
           'ave'  : 'Avenue',
           'Hwg'  : 'Highway',
           'Hwy'  : 'Highway',
           'Sq'   : 'Square'}

# Leading compass abbreviation (with its trailing space) -> full word.
# Every replacement keeps the trailing space so the direction does not fuse
# with the following word ('S. ' previously mapped to 'South' without it).
mapping_abbrev = { 'W ': 'West ', 'S ': 'South ', 'N ': 'North ', 'E ': 'East ',\
                   'W. ': 'West ', 'S. ': 'South ', 'N. ': 'North ', 'E. ': 'East '}

def audit_street_type(street_types, street_name):
    """Record a street name under its street type when that type is unexpected.

    street_types -- mapping of street type -> set of street names; updated in place.
    street_name  -- the street name to inspect.
    Nothing is recorded when the regex finds no match or the matched type
    is listed in `expected`.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

def is_street_name(elem):
    """Return True when `elem` is a <tag> element whose 'k' is 'addr:street'."""
    if elem.tag != 'tag':
        return False
    return elem.attrib['k'] == 'addr:street'

def audit(osmfile):
    """Collect street names whose street type is not in the expected list.

    osmfile -- path of the OSM XML file to scan.
    Returns a defaultdict mapping each unexpected street type to the set of
    street names that use it (as filled in by audit_street_type).
    """
    street_types = defaultdict(set)
    # The context manager guarantees the file handle is closed after parsing.
    with open(osmfile, 'rb') as osm_file:
        # Use 'end' events: on a 'start' event the children of an element are
        # not guaranteed to have been parsed yet, so <tag> children could be missed.
        for _, elem in ET.iterparse(osm_file, events=('end',)):
            if elem.tag == 'node' or elem.tag == 'way':
                for tag in elem.iter('tag'):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types
	
def update_street(name, mapping_street, mapping_abbrev):
    """Return a cleaned-up version of a street name.

    name           -- the street name to clean.
    mapping_street -- dict mapping an abbreviated street type (the last token
                      of the name) to its full form.
    mapping_abbrev -- dict mapping a leading direction abbreviation followed
                      by one space (e.g. 'N ' or 'N. ') to its full form.

    The trailing street type and the leading direction are expanded when they
    appear in the mappings, then every word is capitalised with
    string.capwords (which also collapses runs of whitespace).
    """
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        replacement = mapping_street.get(street_type)
        if replacement is None:
            # Also accept the dotted spelling ('Ave.') of a known abbreviation.
            replacement = mapping_street.get(street_type.rstrip('.'))
        if replacement is not None:
            # Replace exactly the matched trailing token.
            name = name[:m.start()] + replacement
    # Updating W, E, N, S to West, East, North, South if they are at the beginning of an address.
    m_1 = street_abbrev_re.search(name)
    if m_1:
        # The regex accepts any amount of whitespace; normalise it to the
        # single trailing space used by the mapping keys.
        street_abbrev = m_1.group().strip() + ' '
        if street_abbrev in mapping_abbrev:
            # Swap only the matched prefix. Treating the abbreviation as a
            # regex pattern would let '.' match any character and would
            # replace every occurrence in the name.
            name = mapping_abbrev[street_abbrev] + name[m_1.end():]
    # capitalizing first letter of all words in problematic address
    name = string.capwords(name)
    return name

1) The main problem we encountered in this dataset comes from inconsistent street name abbreviations. We build a regex matching the last token of the string, where the street type usually appears, and keep an `expected` list of street types that do not need to be cleaned.  
2) The `audit_street_type` function searches the input string with the regex. If there is a match and it is not in the `expected` list, the match is added as a key and the street name is added to that key's set.  
3) The `is_street_name` function checks whether the tag's `k` attribute equals `'addr:street'`.  
4) The `audit` function returns the dictionary of unexpected street types collected with the previous two functions.  
5) After that, we print the output of the audit. With the list of all the abbreviated street types, we can fill in our `mapping_street` dictionary in preparation for converting these street names into their proper form. (list of 1)  
6) `update_street` is the last step of the process: it takes the old name and returns a cleaned-up name. (list of 2)

In [10]:
# Audit the street names in `data`, then print every flagged name next to
# the value update_street returns for it.
for street_type, ways in audit(data).iteritems():
    for name in ways:
        better_name = update_street(name, mapping_street, mapping_abbrev)
        print name, '=>', better_name

San Jose => San Jose
Arica => Arica
Las Diamelas => Las Diamelas
Maria Izaga => Maria Izaga
Elias Aguirre => Elias Aguirre
Avenida Luis Gonzales => Avenida Luis Gonzales
Yahuar Huaca => Yahuar Huaca
Simon Bolivar => Simon Bolivar
Salaverry => Salaverry
Av. Jose Balta => Av. Jose Balta
Leoncio Prado => Leoncio Prado
Conquista => Conquista
Av. Pedro Ruiz => Av. Pedro Ruiz
la Libertad => La Libertad
Francisco Cabrera => Francisco Cabrera
Husares de Junin => Husares De Junin
Alfonso Ugarte => Alfonso Ugarte
Paul Harris => Paul Harris
Loreto => Loreto
Manuel Seoane => Manuel Seoane
Congreso => Congreso
Juan Tomis Stack => Juan Tomis Stack
Los Amautas => Los Amautas
El Eden => El Eden
Panamericana norte => Panamericana Norte
Sáenz Peña => Sáenz Peña
Elvira Garcia Y Garcia => Elvira Garcia Y Garcia
Los Andes => Los Andes
Avenida Oriente => Avenida Oriente
Pasaje manuel seoane => Pasaje Manuel Seoane
eduardo mesa => Eduardo Mesa
Av. Angamos => Av. Angamos
Zona Industrial => Zona Industrial
Las

### Postal Codes
Postal code strings posed a different sort of problem, forcing a decision to strip all leading and trailing characters before and after the main 5-digit zip code. This effectually dropped all leading state characters (as in 'CA950543') and 4-digit zip code extensions following a hyphen ('95014-1899'). This 5-digit constriction benefits MongoDB aggregation calls on postal codes.  
1) Although most of the zip codes are correct, there are still a lot of zip codes that are not in the plain 5-digit format. We process them the same way as the street names. (list of 1)  
2) The output of the cleaned zip codes is summarised below. They are in the 5-digit format. (list of 2)

In [11]:
def audit_zipcode(invalid_zipcodes, zipcode):
    """Record a zip code as invalid unless it starts with '95'.

    invalid_zipcodes -- mapping of two-character prefix -> set of zip codes;
                        updated in place.
    zipcode          -- the zip code string to check.
    """
    twoDigits = zipcode[0:2]
    # Compare against the string '95'. The prefix is a str, so comparing it to
    # the integer 95 was always unequal and flagged every numeric zip code.
    # Non-numeric prefixes (e.g. 'CA') are flagged by the same test.
    if twoDigits != '95':
        invalid_zipcodes[twoDigits].add(zipcode)
        
def is_zipcode(elem):
    """Return True when `elem` is a <tag> element whose 'k' is 'addr:postcode'."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == 'addr:postcode'

def audit_zip(osmfile):
    """Collect the zip codes that audit_zipcode considers invalid.

    osmfile -- path of the OSM XML file to scan.
    Returns a defaultdict mapping a two-character prefix to the set of
    flagged zip codes starting with it.
    """
    invalid_zipcodes = defaultdict(set)
    # The context manager guarantees the file handle is closed after parsing.
    with open(osmfile, 'rb') as osm_file:
        # Use 'end' events: on a 'start' event the children of an element are
        # not guaranteed to have been parsed yet, so <tag> children could be missed.
        for _, elem in ET.iterparse(osm_file, events=('end',)):
            if elem.tag == 'node' or elem.tag == 'way':
                for tag in elem.iter('tag'):
                    if is_zipcode(tag):
                        audit_zipcode(invalid_zipcodes, tag.attrib['v'])
    return invalid_zipcodes

In [12]:
def update_postal(zipcode):
    """Reduce a postal code string to its 5-digit form.

    zipcode -- the raw postal code string.

    Returns
      * the first run of digits, cut to at most five digits, when the string
        starts with the letters 'CA' (e.g. 'CA950543' -> '95054');
      * the leading five digits when the string starts with '95' followed by
        at least three more digits (e.g. '95014-1899' -> '95014');
      * None in every other case.
    """
    # Leading letters, if any (empty string when the code starts with a non-letter).
    prefix = re.match(r'[a-zA-Z]*', zipcode).group()
    if prefix == 'CA':
        digit_runs = re.findall(r'\d+', zipcode)
        if digit_runs:
            # Keep only the main five digits; longer runs carry extra digits.
            return digit_runs[0][:5]
        return None
    # Require five leading digits so a short code such as '9501' yields None
    # instead of raising IndexError.
    five_digits = re.match(r'95\d{3}', zipcode)
    if five_digits:
        return five_digits.group()
    return None

# Audit the postal codes in `data`, then print every flagged code next to
# the value update_postal returns for it.
# Here `street_type` is the two-character prefix key and `name` is a postal code.
for street_type, ways in audit_zip(data).iteritems():
    for name in ways:
        better_name = update_postal(name)
        print name, '=>', better_name

140101 => None
14820 => None
074 => None


# 3. Data Overview
This section contains basic statistics about the dataset and the MongoDB queries used to gather them.  
  
### Preparing for MongoDB by converting XML to JSON
In order to transform the data from XML to JSON, we need to follow these rules:  
1) Process only 2 types of top level tags: "node" and "way"  
2) All attributes of "node" and "way" should be turned into regular key/value pairs, except:   attributes in the CREATED array should be added under a key "created", attributes for latitude and longitude should be added to a "pos" array, for use in geospatial indexing. Make sure the values inside "pos" array are floats and not strings.  
3) If second level tag "k" value contains problematic characters, it should be ignored  
4) If second level tag "k" value starts with "addr:", it should be added to a dictionary "address"  
5) If second level tag "k" value does not start with "addr:", but contains ":", you can process it same as any other tag.  
6) If there is a second ":" that separates the type/direction of a street, the tag should be ignored  
After all the cleaning and data transformation are done, we would use last function process_map and convert the file from XML into JSON format

In [13]:
client = MongoClient()
db=client.project

In [14]:
db.doc.drop()

In [15]:
# Key-classification regexes (the same patterns as in the key-type audit cell).
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Characters that make a tag key problematic; shape_element skips keys matching this.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Matches keys that start with "addr:".
address_regex = re.compile(r'^addr\:')
# Matches keys that start with "street".
street_regex = re.compile(r'^street')

# Schema object loaded from the local `schem` module.
# NOTE(review): SCHEMA is not referenced anywhere else in the visible notebook -- confirm whether it is still needed.
SCHEMA = schem.schema

# Element attributes that shape_element groups under the "created" key.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    """Convert a 'node' or 'way' XML element into a dict for MongoDB.

    element -- an ElementTree element.

    Returns None for any element that is not a 'node' or a 'way'. Otherwise
    returns a dict with:
      * 'type'      -- the element tag ('node' or 'way');
      * 'created'   -- the attributes listed in CREATED;
      * 'pos'       -- [lat, lon] as floats, when both attributes exist;
      * 'node_refs' -- the 'ref' values of <nd> children (ways);
      * 'address'   -- the 'addr:*' tags with the prefix removed; the street
                       name is cleaned with update_street;
      * every other attribute and every other tag as a plain key/value pair.
    Tags whose key contains a problematic character are skipped.
    """
    if element.tag != 'node' and element.tag != 'way':
        return None

    node = {'type': element.tag}
    address = {}

    # Top-level attributes.
    for a in element.attrib:
        if a in CREATED:
            node.setdefault('created', {})[a] = element.get(a)
        elif a in ('lat', 'lon'):
            # Handled together below so that 'pos' is only set when both exist.
            continue
        else:
            node[a] = element.get(a)
    if 'lat' in element.attrib and 'lon' in element.attrib:
        node['pos'] = [float(element.get('lat')), float(element.get('lon'))]

    # Second-level elements: <nd> references and <tag> key/value pairs.
    for e in element:
        if e.tag == 'nd':
            refs = node.setdefault('node_refs', [])
            if 'ref' in e.attrib:
                refs.append(e.get('ref'))
            continue

        # throw out not-tag elements and elements without `k` or `v`
        if e.tag != 'tag' or 'k' not in e.attrib or 'v' not in e.attrib:
            continue
        key = e.get('k')
        val = e.get('v')

        # skip problematic characters
        if problemchars.search(key):
            continue

        # Clean the street name on the copied value. The XML tree is left
        # untouched, and descendants are not re-walked for every element.
        if key == 'addr:street':
            val = update_street(val, mapping_street, mapping_abbrev)

        if address_regex.search(key):
            address[key.replace('addr:', '')] = val
        else:
            # NOTE(review): a tag whose key is 'type' overwrites node['type']
            # set above -- confirm whether that is intended.
            node[key] = val

    # compile address
    if len(address) > 0:
        node['address'] = {}
        street_full = None
        street_dict = {}
        street_format = ['prefix', 'name', 'type']
        for key in address:
            val = address[key]
            if key == 'street':
                street_full = val
            elif street_regex.search(key) and 'street:' in key:
                street_dict[key.replace('street:', '')] = val
            else:
                # Includes keys that merely start with 'street' (such as
                # 'streetnumber'), which used to be dropped silently.
                node['address'][key] = val
        # assign street_full or fall back to the compiled street components
        if street_full:
            node['address']['street'] = street_full
        elif len(street_dict) > 0:
            # Join only the components that exist; requiring all three
            # raised KeyError for partially tagged streets.
            parts = [street_dict[part] for part in street_format if part in street_dict]
            if parts:
                node['address']['street'] = ' '.join(parts)
    return node

In [16]:
def process_map(file_in, pretty = False):
    """Shape every element of an OSM file and write the results as JSON lines.

    file_in -- path of the OSM XML file; the output goes to '<file_in>.json'.
    pretty  -- when true, each JSON document is indented by two spaces.
    Returns the list of shaped documents (one per element for which
    shape_element returned a truthy value).
    """
    out_path = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(out_path, "w") as sink:
        for _event, elem in ET.iterparse(file_in):
            record = shape_element(elem)
            if not record:
                continue
            shaped.append(record)
            sink.write(json.dumps(record, indent=indent) + "\n")
    return shaped

db.doc.insert_many(process_map(data));

### File sizes

In [17]:
print 'The original OSM file is ' + str(os.path.getsize(data)/1.0e6) + ' MB'

The original OSM file is 3.329866 MB


In [18]:
print 'The JSON file is ' + str(os.path.getsize(data + '.json')/1.0e6) + ' MB'

The JSON file is 3.795993 MB


In [19]:
# Number of documents, we defined it for next section.
number_document = db.doc.find().count()
print 'The number of documents is ' + str(number_document)

The number of documents is 17053


In [20]:
# Number of nodes
print 'The number of node is ' + str(db.doc.find({'type':'node'}).count())

The number of node is 13332


In [21]:
# Number of ways
print 'The number of way is ' + str(db.doc.find({'type':'way'}).count())

The number of way is 3721


In [22]:
# Number of unique users, we defined it for next section.
number_unique_users = len(db.doc.distinct('created.user'))
print 'The number of unique users is ' + str(number_unique_users)

The number of unique users is 91


In [23]:
# Top contributing user: group documents by created.user, count them,
# sort by count (descending) and keep the first result.
cursor = db.doc.aggregate([{'$group':{'_id':'$created.user', 'count':{'$sum':1}}}, {'$sort':{'count':-1}}, {'$limit':1}])
# user1 / user1_count are only assigned if the pipeline yields a result.
for res in cursor:
    user1=res['_id']
    user1_count=res['count']
print 'The first contributor is ' + user1 + ' with '+ str(user1_count) + ' contributions.'

The first contributor is TELCOM IP with 4921 contributions.


In [24]:
# Number of users appearing only once (having 1 post); reused in the next section.
# Pipeline: count documents per created.user, keep the users whose count is 1,
# then sum those counts -- each is 1, so 'total' equals the number of such users.
user_once=db.doc.aggregate([{'$group':{'_id':'$created.user', 'count':{'$sum':1}}}, 
                       {'$sort':{'count':1}},
                       {'$match':{'count':1}},
                       {'$group':{'_id':'null','total':{'$sum':'$count'}}}
                        ])
# number_user_once is only assigned if the pipeline yields a result.
for res in user_once:
    number_user_once=res['total']

print 'There is ' + str(number_user_once) + ' users appearing only once.'

There is 22 users appearing only once.


# 4. Additional Ideas
### Contributor statistics and gamification suggestion
The contributions of users seems incredibly skewed, possibly due to automated versus manual map editing (the word “bot” appears in some usernames). Here are some user percentage statistics:  
1) Top1 Contributor is TELCOM IP, contribution percentage is 28%.  
2) Combined top 2 users are [u'TELCOM IP', u'negro'], contribution percentage is 42%.  
3) Combined top 10 users are [u'TELCOM IP', u'negro', u'Diego Sanguinetti', u'pizza4days', u'ovruni', u'WorstFixer',   u'greecemapper', u'Paper_', u'MintCondition', u'dbusse'], contribution percentage is 88%.  
4) 24% of users contribute with one post.    
5) Thinking about the user percentages from the statistics below, I’m reminded of “gamification” as a motivating force for contribution. In the context of OpenStreetMap, if user data were more prominently displayed, perhaps others would take the initiative to submit more edits to the map. And, if everyone sees that only 10 power users are creating more than 88% of a given map, that might spur the creation of more efficient bots, especially if certain gamification elements were present, such as rewards, badges, or a leaderboard.

In [25]:
# Calculate the percentage of contribution and find the top N users.
# Connect to the `project` database and count the documents in `doc`;
# topn_contrib below reads `db` and `number_document` as globals.
client = MongoClient()
db=client.project
number_document = db.doc.find().count()

def topn_contrib(n, user=False):
    """Return the share of documents created by the top `n` users.

    n    -- how many of the most active users to combine.
    user -- when True, also return the list of those user names.

    Reads the module-level `db` (database handle) and `number_document`
    (total number of documents).

    Returns the integer percentage (rounded down) of documents created by
    the top `n` users, or the tuple (user_names, percentage) when `user` is
    True. The percentage is 0 when there are no documents.
    """
    # One aggregation feeds both the names and the total, so the two results
    # cannot disagree when several users are tied on the same count.
    ranked = list(db.doc.aggregate([
        {'$group': {'_id': '$created.user', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
        {'$limit': n},
    ]))
    top_n_users = [res['_id'] for res in ranked]
    top_n_contrib_count = sum(res['count'] for res in ranked)

    # Floor division keeps the integer percentages on every Python version;
    # the guard avoids dividing by zero on an empty collection.
    if number_document:
        percent_contrib_topn = (top_n_contrib_count * 100) // number_document
    else:
        percent_contrib_topn = 0

    if user == True:
        return top_n_users, percent_contrib_topn
    else:
        return percent_contrib_topn

In [26]:
top1,top1_percent_contrib=topn_contrib(1,user=True)
print 'Top1 Contributor is ' + top1[0] + ', contribution percentage is ' + str(top1_percent_contrib) + '%.'
top2,top2_percent_contrib=topn_contrib(2,user=True)
print 'Combined top 2 users are ' + str(top2) + ', contribution percentage is ' + str(top2_percent_contrib) + '%.'
top10,top10_percent_contrib=topn_contrib(10, user=True)
print 'Combined top 10 users are ' + str(top10) + ', contribution percentage is ' + str(top10_percent_contrib) + '%.'
percent_user_1post=(number_user_once*100)/number_unique_users
print str(percent_user_1post) + '% of users contribute with one post.'

Top1 Contributor is TELCOM IP, contribution percentage is 28%.
Combined top 2 users are [u'TELCOM IP', u'negro'], contribution percentage is 42%.
Combined top 10 users are [u'TELCOM IP', u'negro', u'Diego Sanguinetti', u'pizza4days', u'ovruni', u'WorstFixer', u'greecemapper', u'Paper_', u'MintCondition', u'dbusse'], contribution percentage is 88%.
24% of users contribute with one post.


### Additional data exploration using MongoDB queries
1) Top 10 appearing amenities

In [27]:
amenity = db.doc.aggregate([{'$match':{'amenity':{'$exists':1}}},
                               {'$group':{'_id':'$amenity', 'count':{'$sum':1}}},
                               {'$sort':{'count':-1}},
                               {'$limit':10}])

print list(amenity)

[{u'count': 22, u'_id': u'bus_station'}, {u'count': 12, u'_id': u'fuel'}, {u'count': 12, u'_id': u'school'}, {u'count': 7, u'_id': u'restaurant'}, {u'count': 5, u'_id': u'hospital'}, {u'count': 5, u'_id': u'place_of_worship'}, {u'count': 4, u'_id': u'marketplace'}, {u'count': 4, u'_id': u'veterinary'}, {u'count': 4, u'_id': u'townhall'}, {u'count': 4, u'_id': u'university'}]


2) Biggest religion

In [28]:
# Count places of worship per religion and keep the most frequent one.
# The $match names 'amenity' once: a dict literal cannot hold the same key
# twice, so the earlier {'$exists': 1} entry was silently discarded anyway.
biggest_religion = db.doc.aggregate([{'$match':{'amenity':'place_of_worship'}},
                    {'$group':{'_id':'$religion', 'count':{'$sum':1}}},
                    {'$sort':{'count':-1}}, {'$limit':1}])

print list(biggest_religion)

[{u'count': 3, u'_id': u'christian'}]


3) Most popular cuisines

In [29]:
# Count restaurants per cuisine and keep the most frequent value.
# The $match names 'amenity' once: a dict literal cannot hold the same key
# twice, so the earlier {'$exists': 1} entry was silently discarded anyway.
# Restaurants without a 'cuisine' field are grouped under a null _id.
popular_cuisines = db.doc.aggregate([{'$match':{'amenity':'restaurant'}}, 
                    {'$group':{'_id':'$cuisine', 'count':{'$sum':1}}},
                    {'$sort':{'count':-1}}, {'$limit':1}])

print list(popular_cuisines)

[{u'count': 4, u'_id': None}]


# 5. Conclusion
1) The map about the city of San Jose is relatively clean so I could retrieve some interesting content. But still the data is not entirely clean.  
2) The data contains some mistakes or different references for the same feature. So I had to clean the data programmatically for the street and the postal codes.  
3) When we audited the data, it was clear that, although there are minor errors caused by human input, the dataset is fairly clean. Considering there are hundreds of contributors to this map, a certain number of human errors is to be expected in this project. I'd recommend a structured input form so that everyone enters data in the same format, which would reduce this kind of error.  
4) We can incentivize users by gamify the contribution process, then we can create a recommendation engine to leverage these data (eg. restaurant recommendation, building, etc).  
5) OpenStreetMap is an open source project, and there are still a lot of areas left unexplored, as people tend to focus on certain key areas and leave other parts outdated. Since each node has a coordinate (latitude & longitude), we can resolve this issue by cross-referencing/cross-validating missing data against other databases such as the Google API.

##### References
1) https://github.com/GuillaumeSalvan/P3-Wrangle-OpenStreetMap-Data  
2) https://github.com/lyvinhhung/Udacity-Data-Analyst-Nanodegree/tree/master/p3%20-%20Wrangle%20OpenStreetMap%20Data