## Choose Your Map Area
#### san-francisco_california.osm

#### Count all types of tags in the XML file

In [12]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# Define a function to count all types of tags in the xml file
def count_tags(filename):
    """Tally how often each tag name occurs in an XML document.

    The document is streamed with ET.iterparse and every element is counted
    once, on its "start" event.

    Args:
        filename: whatever ET.iterparse accepts as its source.

    Returns:
        A defaultdict(int) mapping tag name -> number of occurrences.
    """
    tally = defaultdict(int)
    start_events = ET.iterparse(filename, events=("start",))
    for _event, node in start_events:
        tally[node.tag] = tally[node.tag] + 1
    return tally
# Count how often each tag name occurs in san-francisco_california.osm.
count_tags("san-francisco_california.osm")


defaultdict(<type 'int'>, {'node': 4482203, 'nd': 5341923, 'bounds': 1, 'member': 43488, 'tag': 1571906, 'osm': 1, 'way': 514729, 'relation': 7026})

#### Method to count unique users

In [7]:
# Count unique  method based on key
def count_unique(filename, key):
    """Print the number of distinct values that attribute `key` takes.

    Every element yielded by ET.iterparse is inspected; elements that do not
    carry the attribute are skipped. The count is printed, not returned.

    Args:
        filename: whatever ET.iterparse accepts as its source.
        key: name of the XML attribute whose distinct values are counted.
    """
    distinct_values = {
        node.attrib[key]
        for _, node in ET.iterparse(filename)
        if key in node.attrib
    }
    print(len(distinct_values))
    
# Print the number of distinct "user" attribute values in the OSM file.
count_unique("san-francisco_california.osm", "user")


3536


#### Main method and helper methods to clean the data and generate the JSON file

In [27]:
from collections import defaultdict
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

# Regex for invalid_chars and street_type
# invalid_chars matches a single character from a fixed set of punctuation
# and whitespace: = + / & < > ; ' " ? % # $ @ , . space, tab, CR and LF.
invalid_chars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# street_type_re matches a run of non-whitespace characters (optionally
# followed by a period) that starts at a word boundary and ends at the end of
# the string. is_valid_street and update_street_name treat the matched text
# as the street-type suffix.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street-type suffixes (abbreviations and misspellings) mapped to the spelling
# they are rewritten to. Lookups are exact and case-sensitive: the callers
# test the matched suffix with `in mapping.keys()`.
mapping = {"St": "Street",
           "St.": "Street",
           "street": "Street",
           "Rd.": "Road",
           "Rd": "Road",
           "Ave": "Avenue",
           "Ave.": "Avenue",
           "Blvd": "Boulevard",
           "Blvd.": "Boulevard",
           "Boulevade": "Boulevard",
           "Cir": "Circle",
           "Cres": "Crescent",
           "Cressent": "Crescent",
           "Crt.": "Court",
           "Dr": "Drive",
           "Dr.": "Drive",
           "Driver": "Drive",
           "Terace": "Terrace"
          }

#*********************************************************
#Common methods used for cleaning and converting data.
#*********************************************************
##Set element details based on element object
def set_element_details(json_obj,element):
    """Copy the element's "id" and "visible" attributes onto json_obj.

    Delegates to set_elements, so an attribute the element does not carry is
    stored as None.
    """
    top_level_attributes = ["id", "visible"]
    set_elements(json_obj, element, top_level_attributes)

def set_position(json_obj, element):
    """Store the element's coordinates in json_obj["pos"] as [lat, lon].

    json_obj["pos"] is set to [0, 0] when the "lat" attribute is missing or
    empty, when "lon" is missing, or when either value cannot be parsed as a
    float.

    Args:
        json_obj: dict that receives the "pos" entry.
        element: object exposing .get(name) for the "lat" and "lon" attributes.
    """
    json_obj["pos"] = [0, 0]
    lat = element.get("lat")
    if not lat:
        return
    try:
        json_obj["pos"] = [float(lat), float(element.get("lon"))]
    except (TypeError, ValueError):
        # float(None) raises TypeError (attribute missing); float("abc")
        # raises ValueError (attribute present but not numeric).
        json_obj["pos"] = [0, 0]

def set_sub_element(json_obj,element):
    """Attach a "created" sub-dict holding the element's edit metadata.

    The sub-dict receives the "version", "changeset", "timestamp", "user" and
    "uid" attributes via set_elements; any existing "created" entry is
    replaced.
    """
    created = {}
    json_obj["created"] = created
    set_elements(created, element, ["version", "changeset", "timestamp", "user", "uid"])

def set_elements(json_obj, element, attributes):
    """Copy the named attributes from element into json_obj.

    Args:
        json_obj: dict to populate, modified in place.
        element: object exposing .get(name), e.g. an ElementTree element.
        attributes: iterable of attribute names to copy.

    An attribute the element lacks is stored with whatever element.get
    returns for it (None for ElementTree elements).
    """
    json_obj.update((name, element.get(name)) for name in attributes)
        
# split first and second part of the colon.
def split_colon(string, pos):
    """Return the text before or after the first colon in `string`.

    Args:
        string: text expected to contain a ":".
        pos: "first" for the part before the first colon, "second" for
            everything after it.

    Returns:
        The requested part, or None when pos is neither "first" nor "second".

    Raises:
        ValueError: pos is "first" or "second" and string has no colon.
    """
    if pos not in ("first", "second"):
        return None
    colon_at = string.index(":")
    return string[:colon_at] if pos == "first" else string[colon_at + 1:]
# Convert to camel case
def camelCase(st):
    """Capitalize each whitespace-separated word of `st`.

    The first character of every word is upper-cased and the rest of the word
    is lower-cased; the words are re-joined with single spaces, so runs of
    whitespace collapse.
    """
    capitalized_words = []
    for word in st.split():
        capitalized_words.append(word[0].upper() + word[1:].lower())
    return ' '.join(capitalized_words)

# Check whether the street name is valid or not
def is_valid_street(street_name):
    """Report whether the street name ends in a suffix listed in `mapping`.

    Despite the name, True means the trailing token matched by street_type_re
    is a key of `mapping`, i.e. one that update_street_name would rewrite.

    Returns:
        True or False when street_type_re finds a trailing token, None when
        it finds nothing.
    """
    suffix_match = street_type_re.search(street_name)
    if not suffix_match:
        return None
    return suffix_match.group() in mapping
        
# Funtion to update street name 
def update_street_name(name, mapping):
    """Replace the street-type suffix of `name` with its mapped spelling.

    Args:
        name: street name to normalize.
        mapping: dict from suffix as written to the replacement spelling.

    Returns:
        `name` with the suffix matched by street_type_re substituted when that
        suffix is a key of `mapping`; otherwise `name` unchanged.
    """
    found = street_type_re.search(name)
    if not found:
        return name
    suffix = found.group()
    if suffix not in mapping:
        return name
    return street_type_re.sub(mapping[suffix], name)

# Iterate through all the tags in the element and add it to the dictionary.
def build_dictionary(element,json_object):
    """Copy the <nd> refs and <tag> key/value pairs of `element` into json_object.

    Walks `element` and all of its descendants:

    * <nd>: the "ref" attribute is appended to json_object["node_refs"],
      which must already exist.
    * <tag>: the "v" value is cleaned for a few known keys, then stored
      under a location derived from the "k" key:
        - "addr:street": the suffix is rewritten through `mapping` when
          is_valid_street reports a known suffix.
        - "addr:postcode": every character that is not a digit or "." is
          removed.
        - "addr:city": a non-empty value that is not already a `unicode`
          instance is decoded as UTF-8 and passed through camelCase.
        - a key containing any character matched by invalid_chars is skipped.
        - a key with exactly one colon is stored at
          json_object[prefix][suffix]; json_object[prefix] must already be a
          dict (convert_to_json pre-creates it).
        - a key with more than one colon is skipped.
        - a key already present in json_object is stored at
          json_object[key][key].
        - any other key is stored at json_object[key].

    Args:
        element: XML element whose <nd>/<tag> descendants are read.
        json_object: dict to populate, modified in place.

    Returns:
        The same json_object that was passed in.
    """
    for elem in element.iter():
        if elem.tag == "nd":
            json_object["node_refs"].append(elem.get("ref"))
        elif elem.tag == "tag":
            k_val = elem.get("k")
            v_val = elem.get("v")

            # Rewrite the street suffix only when it is a known abbreviation.
            if k_val == "addr:street":
                if is_valid_street(v_val):
                    v_val = update_street_name(v_val, mapping)
            # Invalid postal codes like 豊川市, CA9410, CA, - are reduced to
            # whatever digits (and dots) they contain.
            if k_val == "addr:postcode":
                if v_val:
                    v_val = re.sub(r"[^0123456789\.]", "", str(v_val))
            if k_val == "addr:city":
                if v_val:
                    # NOTE(review): camelCase runs only for values that were
                    # not already `unicode`; confirm that skipping unicode
                    # values is intended.
                    if not isinstance(v_val, unicode):
                        v_val = unicode(v_val, 'utf-8')
                        v_val = camelCase(str(v_val))
            # search(), not match(): match() only inspects the first character
            # of the key, so a problem character later in the key would pass.
            if invalid_chars.search(k_val):
                continue
            elif ":" in k_val:
                if k_val.count(":") > 1:
                    continue
                else:
                    json_object[split_colon(k_val, "first")][split_colon(k_val, "second")] = v_val
            elif k_val in json_object:
                # The key doubles as a prefix (e.g. "name" next to "name:en"),
                # so its plain value is nested under itself.
                json_object[k_val][k_val] = v_val
            else:
                json_object[k_val] = v_val

    return json_object
# Convert xml element to json object
def convert_to_json(element):
    """Build the JSON-ready dict for a <node> or <way> element.

    The dict is populated in this order: "tag" (the element's tag name), the
    id/visible attributes, the "created" sub-dict, an empty "node_refs" list
    when the element has <nd> children, "pos", one empty dict per prefix of a
    colon-separated tag key, and finally everything build_dictionary adds.

    Returns:
        The populated dict, or None for any element that is neither a node
        nor a way.
    """
    # Only nodes and ways are exported.
    if element.tag != "node" and element.tag != "way":
        return None

    json_object = {"tag": element.tag}

    # Top-level attributes, then the edit metadata under "created".
    set_element_details(json_object, element)
    set_sub_element(json_object, element)

    if element.findall('nd'):
        json_object["node_refs"] = []

    set_position(json_object, element)

    # Pre-create a sub-dict for every "prefix:suffix" tag key so that
    # build_dictionary can assign into it.
    prefixes = {
        split_colon(tag.get("k"), "first")
        for tag in element.iter("tag")
        if ":" in tag.get("k")
    }
    for prefix in prefixes:
        json_object[prefix] = {}

    build_dictionary(element, json_object)
    return json_object

#Main method to read osm file and convert to json file
def convert_osm_to_json(file_name):
    """Stream an OSM XML file and write its nodes and ways as JSON.

    The output goes to "<file_name>.json". Each element for which
    convert_to_json returns a non-empty dict is written as one
    json.dumps(..., indent=5) document followed by a newline.

    Args:
        file_name: path of the OSM XML file to read.

    Returns:
        An empty list. Nothing is ever added to it; it is returned only so
        that existing callers of this function keep working.
    """
    # Create a json file based on the osm file name
    json_file = "{0}.json".format(file_name)
    data = []
    with codecs.open(json_file, "w") as file_out:
        # Iterate through all the xml elements in the file
        for _, element in ET.iterparse(file_name):
            # Convert each element to json object
            el = convert_to_json(element)
            # If json object is present write to json file
            if el:
                file_out.write(json.dumps(el, indent=5)+"\n")
                # iterparse keeps every parsed element in the tree. Drop the
                # attributes and children of a node/way once it has been
                # written so memory does not grow with the size of the file.
                element.clear()
    return data

# Convert the OSM file; the output is written to san-francisco_california.osm.json.
convert_osm_to_json("san-francisco_california.osm")

[]

In [25]:
# Quick check of camelCase on an all-caps city name. The parenthesized form
# prints the same text on Python 2 and is also valid on Python 3.
print(camelCase("OAKLAND"))

Oakland


## Problems encountered
* Inconsistent abbreviated addresses
    * Updated to use consistent values from the mapping dictionary
    * Before: 'St', 'Ave'
    * After: 'Street', 'Avenue'
* Invalid characters 
    * Invalid characters are ignored
    * re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]').match(k_val):
* Null values and missing attributes
    * There are a lot of missing attributes in each node, which were ignored.
* Inconsistent postal codes were present in the data, such as 豊川市, CA9410, CA
    * Removed all trailing and leading spaces
    * Removed all non-numeric values
    * Removed values written in other languages.
    * Removed state abbreviations from the postcode.
    * Before: '豊川市', 'CA9410', 'CA'
    * After: '', '9410', ''
* Inconsistent city names
    * Converted all-capital city names to camel case
    * Converted mixed-case city names to camel case
    * Before: 'OAKLAND', 'san francisco', 'daly City'
    * After: 'Oakland', 'San Francisco', 'Daly City'
    * Code: ' '.join(''.join([w[0].upper(), w[1:].lower()]) for w in st.split())
* Invalid position object with lon and lat 
    * If the lon and lat is not a valid string or not a valid float value initialized to zero
    * Code:
    try:
        json_obj["pos"] = [float(element.get("lat")), float(element.get("lon"))]
    except TypeError:  
        json_obj["pos"] = [0, 0]
* Removed hyphens from values that contain them.
   * Before: 'san-francisco'
   * After: 'San Francisco'
* Moved the attributes related to the user to the created object.
   * Code snippet:
   def set_elements(json_obj, element, attributes):
    for attribute in attributes:
        json_obj[attribute] = element.get(attribute) 
    set_elements(json_obj["created"],element,["version", "changeset", "timestamp", "user", "uid"])
* Moved address related attributes under addr object



## Other ideas about the dataset
* Marking some fields as mandatory while the data is being collected would help improve data quality. This option will not help clean up the existing data, but it will improve the quality of newly created data.
    * The problem with this option is that users cannot enter data unless it is complete, and in some cases they may not have all the attributes. The quality of the data will improve, but the quantity may go down.
   
   
* Provide auto-completion or drop-downs to collect consistent data instead of allowing users to type in anything. This, too, will help improve the quality of new data.
    * Pre-populating all the data required for auto-completion and drop-downs could be an issue. We would have to spend more time gathering all the metadata before collecting the actual data.


* Another way to improve the data is to validate the details against other APIs, such as Google or Yahoo, and show the user suggested corrections before they save the data. This option can help clean the existing dataset as well as all new additions to it.
    * The problem with this approach is that we would be trusting the other services more than the user-entered data. In some cases we may repeat issues that exist in the other services.


* Another option would be to collect the same data from different users and save multiple copies, then create a new data set based on the most common values. Ex: if three users enter San Francisco, San Francisco, and SF, we take San Francisco as the correct value because most people entered it. This option would require more effort in collecting and correcting the data.
    * The problem with this approach is that we would be collecting and storing more than twice as much data as usual. Normally we would store one record per position, but with this approach we have to store many records per position.


* Use speech-to-text software to take the input instead of typing in the details. Existing data cannot be corrected using this approach, but it would make collecting data easier in the future.
    * Recording data with speech-to-text could be a problem because of different languages and accents. There is a possibility of recording wrong data, and validating the recordings could be complicated.


## Data Overview & additional queries
* 958.8 MB san-francisco_california.osm
* 1.87 GB san-francisco_california.osm.json


#### Total number of documents
* db.sanfrancisco.count()
* 4996932

#### Total number of nodes
* db.sanfrancisco.find({'tag':'node'}).count()
* 4482203

#### Total number of ways
* db.sanfrancisco.find({'tag':'way'}).count()
* 514729

#### Unique users
* db.sanfrancisco.distinct('created.user').length
* 3191

#### Most recently created document, based on timestamp

* db.sanfrancisco.aggregate([{$sort:{'created.timestamp':-1}},{$limit:1}])
* { "_id" : ObjectId("5706df8987b8bfa46b49a4bb"), "visible" : null, "tag" : "node", "pos" : [ 37.776917, -122.179906 ], "id" : "4079290925", "created" : { "uid" : "3696217", "changeset" : "38074868", "version" : "1", "user" : "Kristymarie42", "timestamp" : "2016-03-25T23:50:12Z" } }

#### Regex-based count
* db.sanfrancisco.find({operator:{$regex:'.*way.*'}}).count()
* 48

#### Group by highway
* db.sanfrancisco.aggregate([{$group:{'_id':'$highway','count':{$sum:1}}},{$sort:{count:-1}},{$limit:10}])
* { "_id" : null, "count" : 4891251 }
{ "_id" : "residential", "count" : 32948 }
{ "_id" : "service", "count" : 17360 }
{ "_id" : "footway", "count" : 10183 }
{ "_id" : "turning_circle", "count" : 5940 }
{ "_id" : "secondary", "count" : 4937 }
{ "_id" : "crossing", "count" : 3867 }
{ "_id" : "tertiary", "count" : 3802 }
{ "_id" : "traffic_signals", "count" : 3397 }
{ "_id" : "primary", "count" : 3269 }

#### Group by religion
* db.sanfrancisco.aggregate([{$group:{'_id':'$religion','count':{$sum:1}}},{$sort:{count:-1}},{$limit:10}])
{ "_id" : null, "count" : 4995820 }
{ "_id" : "christian", "count" : 1036 }
{ "_id" : "buddhist", "count" : 33 }
{ "_id" : "jewish", "count" : 20 }
{ "_id" : "muslim", "count" : 8 }
{ "_id" : "taoist", "count" : 3 }
{ "_id" : "unitarian_universalist", "count" : 2 }
{ "_id" : "unitarian", "count" : 2 }
{ "_id" : "scientologist", "count" : 2 }
{ "_id" : "eckankar", "count" : 1 }

#### Group by cuisine
* db.sanfrancisco.aggregate([{$group:{'_id':'$cuisine','count':{$sum:1}}},{$sort:{count:-1}},{$limit:2}])
{ "_id" : null, "count" : 4994452 }
{ "_id" : "mexican", "count" : 247 }

## Others
* The residential highway type has more records than any other highway type.
* There are 3191 unique users who contributed to the data.
* The most recently created record was by the user Kristymarie42 at 2016-03-25T23:50:12Z.
* The oldest record was created by Deanna Earley at 2006-07-02T20:31:53Z.
* The top 3 religions are distributed as
    * christian - 1036, buddhist - 33, jewish - 20.
* Mexican is the most common cuisine.

## Conclusion
I feel that the OSM data was not very well organized and the naming conventions were not consistent. In some cases data was duplicated or missing. The main difficulty was that the data set I chose was big: I had to wait a long time every time I ran my Python scripts for cleaning.
