# OpenStreetMap Data Wrangling for Dublin with MongoDB 

## Dublin Area
Here are the coordinates I used to analyze an area around Dublin Downtown:  
(node(53.3294,-6.3283,53.2646,-6.2121);<;);out meta;

![title](img/dublin.png)

## Problems Encountered in the Map

During the data wrangling process, I validated and corrected the following values: 

- Country  
The country code must be 'IE', which is Ireland's ISO country code. There was only one invalid value for country ('Ireland'), which was converted to 'IE'.  


- City  
While validating the city, I found that the following values are valid:

 Dublin, Mount Merrion, Churchtown, Stillorgan, Dundrum, Knocklyon,
 Ballyroan, Whitechurch, Rathfarnham, Booterstown, Goatstown, 
 Clonskeagh, Sandyford Industrial Estate, Rathmines, Donnybrook, Belfield  
 

- Postal Code  
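The valid postal code format in Ireland is \[A-Z\]\[0-9\](\[A-Z\]|\[0-9\]). Because the selected area lies within Dublin, the code below only accepts codes that start with 'D' (for example D06 or D6W).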

In [11]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict

# Spellings of the country that are accepted for addr:country.
ireland = re.compile(r'^(IE|Ireland)$')

# Element attributes that are grouped under the "created" sub-document.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]

# Element attributes that make up the [lat, lon] position list.
POSITION = ["lat", "lon"]

# Tag keys made of lowercase letters/underscores with exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

# Tag keys made of lowercase letters/underscores with exactly two colons.
lower_double_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')

# City names that are accepted unchanged for addr:city.
city = re.compile(
    r'^(Dublin|Mount Merrion|Churchtown|Stillorgan|Dundrum|Knocklyon'
    r'|Ballyroan|Whitechurch|Rathfarnham|Booterstown|Goatstown'
    r'|Clonskeagh|Sandyford Industrial Estate|Rathmines|Donnybrook|Belfield)$'
)

# Dublin postal-district spellings that are accepted as the city "Dublin".
dublin_other_values = [
    "Dublin 6W",
    "Dublin 12",
    "Dublin 6",
    "Dublin 4",
    "Dublin 16",
    "Dublin 24",
]

# Characters that make a tag key problematic.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Accepted postal codes: 'D', one digit, then a digit or 'W'.
postcode = re.compile(r'^([D][0-9]([0-9]|[W]))$')


# Every tag key or value that was rejected while shaping the elements.
ignored_element = []

# Build the JSON-ready document for a single OSM element.
def shape_element(element):
    """Convert an OSM ``node`` or ``way`` element into a dict.

    Child ``tag`` elements are handled by key:

    * keys matching ``problemchars`` or ``lower_double_colon`` are
      recorded in ``ignored_element`` and skipped;
    * keys matching ``lower_colon`` are delegated to ``parse_colon``,
      which nests them one level deep;
    * any other key is validated with ``Convert`` and stored only when
      the value is reported as valid.

    Child ``nd`` elements contribute their ``ref`` to ``node_refs``.

    Element attributes listed in ``CREATED`` are grouped under
    ``"created"``, ``lat``/``lon`` become the float list ``"pos"``
    (``[lat, lon]``), and every other attribute is copied to the top
    level of the document.

    Args:
        element: an ElementTree element.

    Returns:
        The shaped dict, or ``None`` when the element is neither a
        ``node`` nor a ``way``.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {"id": element.attrib["id"], "type": element.tag}

    for child in element.iter():
        if child.tag == 'tag':
            tag_key = child.attrib["k"]
            if match(problemchars, tag_key) or match(lower_double_colon, tag_key):
                ignored_element.append(tag_key)
                continue
            if match(lower_colon, tag_key):
                parse_colon(node, child, tag_key)
                continue
            is_valid, return_value = Convert(tag_key, child.attrib['v'])
            if not is_valid:
                continue
            if isinstance(node.get(tag_key), dict):
                # A "key:sub" tag was seen before the plain "key" tag.
                # Keep the nested dict and store the plain value under
                # "type", the same convention parse_colon uses for the
                # opposite order, so the result does not depend on the
                # order of the tags.
                node[tag_key]["type"] = return_value
            else:
                node[tag_key] = return_value
        elif child.tag == 'nd':
            node.setdefault('node_refs', []).append(child.attrib['ref'])

    # .items() instead of .iteritems() so this runs on Python 2 and 3.
    for key, value in element.attrib.items():
        if key in CREATED:
            node.setdefault('created', {})[key] = value
        elif key in POSITION:
            if 'pos' not in node:
                node['pos'] = [None] * 2
            # Position is stored as [lat, lon].
            if key == 'lat':
                node["pos"][0] = float(value)
            elif key == 'lon':
                node["pos"][1] = float(value)
        else:
            node[key] = value
    return node

# Store a one-colon tag ("outer:inner") as a nested, two-level entry.
def parse_colon(node, element, tag_key):
    """Add the value of a ``outer:inner`` tag to ``node[outer][inner]``.

    If ``node[outer]`` already holds a plain (non-dict) value, that value
    is kept under ``node[outer]["type"]`` before the nested entry is
    added. The value is validated with ``Convert`` using the full tag
    key (e.g. ``addr:city``) and is stored only when it is reported as
    valid; the ``node[outer]`` dict is created either way.

    Args:
        node: the dict being built for the current element; mutated in
            place.
        element: the ``tag`` element; its ``v`` attribute is the value.
        tag_key: the tag key, containing exactly one colon.
    """
    key1, key2 = tag_key.split(':')

    if key1 not in node:
        node[key1] = {}
    elif not isinstance(node[key1], dict):
        # Preserve the plain value that was stored under this key earlier.
        node[key1] = {"type": node[key1]}

    # Validation goes through Convert, the validator defined in this file
    # for addr:country / addr:city / addr:postcode.
    is_valid_value, final_value = Convert(tag_key, element.attrib['v'])
    if is_valid_value:
        node[key1][key2] = final_value

# Validate and normalise the country, city and postal-code values.
def Convert(key, value):
    """Validate ``value`` for the tag ``key`` and normalise it.

    * ``addr:country``: values matching ``ireland`` become ``"IE"``.
    * ``addr:city``: values matching ``city`` are kept as-is; values in
      ``dublin_other_values`` become ``"Dublin"``.
    * ``addr:postcode``: values matching ``postcode`` are kept as-is.
    * any other key: the value is accepted unchanged.

    A rejected value is appended to ``ignored_element``.

    Returns:
        ``(True, converted_value)`` when the value is accepted,
        ``(False, "")`` otherwise.
    """
    if key == 'addr:country':
        accepted = "IE" if ireland.search(value) else None
    elif key == 'addr:city':
        if city.search(value):
            accepted = value
        elif value in dublin_other_values:
            accepted = "Dublin"
        else:
            accepted = None
    elif key == 'addr:postcode':
        accepted = value if postcode.search(value) else None
    else:
        # Keys without a validation rule pass through untouched.
        return True, value

    if accepted is None:
        ignored_element.append(value)
        return False, ""
    return True, accepted

# Tell whether a compiled pattern is found anywhere in a string.
def match(rg, item):
    """Return ``True`` if ``rg.search(item)`` finds a match, else ``False``."""
    return rg.search(item) is not None

# Entry point: shape every element of an OSM file and dump it as JSON lines.
def process_map(file_in, pretty = False):
    """Parse ``file_in`` and write the shaped elements to ``<file_in>.json``.

    Each element yielded by ``ET.iterparse`` is passed to
    ``shape_element``; every non-empty result is written as one JSON
    document followed by a newline.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when true, each JSON document is indented by two spaces.

    Returns:
        The list of all shaped elements, in the order they were written.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _event, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            fo.write(json.dumps(shaped, indent=indent) + "\n")
    return data

# Shape the Dublin extract and write dublin.osm.json with indented output.
data = process_map('dublin.osm', pretty=True)
# The rejected tag keys and values are collected in ignored_element.

## Data Overview
Here are the overview statistics of the dataset:

dublin.osm      58,280KB  
dublin.osm.json 73,359KB  

>db.dublin.find().count()  
721,184  

>db.dublin.find({"type":"node"}).count()  
626,818  

>db.dublin.find({"type":"way"}).count()  
94,355  



### Other queries
 
Top 10 contributors:

> db.dublin.aggregate([{"\$group":{"_id" : "\$created.user","count" : {"\$sum":1}}}, {"\$sort" : {"count":-1}},{"\$limit":10}])

{ "_id" : "brianh", "count" : 164323 }  
{ "_id" : "Nick Burrett", "count" : 123763 }  
{ "_id" : "Dafo43", "count" : 100971 }  
{ "_id" : "VictorIE", "count" : 92975 }  
{ "_id" : "maguinek", "count" : 56736 }  
{ "_id" : "mackerski", "count" : 32582 }  
{ "_id" : "IrlJidel", "count" : 22571 }  
{ "_id" : "conorb", "count" : 15772 }  
{ "_id" : "ManAboutCouch", "count" : 11326 }  
{ "_id" : "GoldCircle", "count" : 8079 }  


## Additional Ideas
