# Data cleaning and wrangling for Project3 
## Python code with documentations

Note: Some code cells have no number inside the brackets of 'In [ ]' because the cells were combined into this one file at the end. Every cell was run more than once, with several samples or with the original data. I tried to make each code cell as independent as possible.

## 1. Sample data construction

In [None]:
### Code provided in project details ###

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow

# Input: the full OSM XML extract that is read by get_element().
OSM_FILE = "portland_oregon.osm"  
# Output: the down-sampled file written by the script below.
SAMPLE_FILE = "portland_oregon_sample_k200.osm"

k = 200 # Parameter: take every k-th top level element 

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    The file is parsed incrementally; after a yielded element has been
    consumed, the root element is cleared so that already-processed
    children are dropped from the tree.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the start of the root element.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Write the sample file. The file is opened in binary mode and
# ET.tostring(..., encoding='utf-8') returns an encoded byte string, so the
# literal fragments are written as byte strings as well. On Python 2 a
# b'...' literal is an ordinary str, so the output is unchanged; on Python 3
# writing a text literal to a 'wb' file would raise TypeError.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

The size of the original OSM XML file is 1.5 GB. I used the provided code to make samples of different sizes, and I used the samples with k=10, 100, or 200 in the data wrangling process. I will submit the smallest sample (k=200) since its size is 8 MB, which is within the suggested range of 1-10 MB. (k=10: 156 MB, k=100: 15.6 MB)

## 2. Data wrangling

### Auditing tag names

In [12]:
### Case Study scripts. The file name was changed for the project ###

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        A plain dict mapping tag name -> number of elements with that tag.
    """
    tag_dic = {}
    for _, elem in ET.iterparse(filename):
        # dict.get supplies the 0 for a tag seen for the first time, so no
        # separate set of known keys is needed.
        tag_dic[elem.tag] = tag_dic.get(elem.tag, 0) + 1
        # Only the tag name is used; drop the element's attributes and
        # children to reduce memory use on large files.
        elem.clear()
    return tag_dic
        
def test():
    """Pretty-print the tag counts of the k=200 sample file."""
    # The case-study file name was replaced with the project's sample data.
    pprint.pprint(count_tags('portland_oregon_sample_k200.osm'))


if __name__ == "__main__":
    test()

{'member': 607,
 'nd': 38353,
 'node': 32890,
 'osm': 1,
 'relation': 32,
 'tag': 24801,
 'way': 4251}


### Auditing users

In [13]:
### Case Study scripts. The file name was changed for the project ###

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Unused placeholder kept from the case-study template; returns None."""
    return None

def process_map(filename):
    """Collect the unique "uid" attribute values found in an XML file.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        A tuple ``(count, users)`` where ``users`` is the set of distinct
        "uid" strings and ``count`` is its size.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        # Elements without a "uid" attribute are skipped explicitly; a bare
        # except would also hide unrelated errors.
        uid = element.attrib.get("uid")
        if uid is not None:
            users.add(uid)
    return len(users), users

def test():
    """Pretty-print the user count and the user ids of the k=200 sample."""
    result = process_map('portland_oregon_sample_k200.osm')
    pprint.pprint(result)


if __name__ == "__main__":
    test()

(298,
 set(['1004928',
      '1007528',
      '1012362',
      '101433',
      '1051550',
      '110263',
      '110767',
      '1110270',
      '11131',
      '115918',
      '117854',
      '120146',
      '120500',
      '1211761',
      '121241',
      '1213011',
      '121502',
      '1219875',
      '1238559',
      '1239795',
      '1240849',
      '1261393',
      '129255',
      '129535',
      '1297659',
      '135163',
      '1370046',
      '1372934',
      '137524',
      '137875',
      '1387291',
      '1399823',
      '1406447',
      '1408522',
      '1425824',
      '1426091',
      '1426189',
      '14293',
      '1442206',
      '144449',
      '145201',
      '145231',
      '146549',
      '147510',
      '149434',
      '152074',
      '152392',
      '1529630',
      '155562',
      '1597155',
      '1679',
      '1694420',
      '1717869',
      '172507',
      '1731253',
      '1772245',
      '1797508',
      '1802220',
      '18069',
      '1817244',
      '

### Auditing street types

In [22]:
### Modified Case Study scripts for the project ###

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Sample file audited by this cell.
OSMFILE = "portland_oregon_sample_k10.osm"
# Matches the final run of non-whitespace characters of a street name
# (starting at a word boundary); that token is treated as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Original list in the case study script
# expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
#            "Trail", "Parkway", "Commons"]

# Modified list for this map area
# Street names whose last token is in this list are not reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Path","Run","Woods","Heights",
            "Broadway", "Circle", "Highway", "Loop", "Terrace", "Way","Circus","Byway","Chantilly",
           "Churchill","Alley","End","Curve", "Crest","Point", "View","Summit","Downs","Preakness",       
           "North","South","East","West"] 

# Modified list for this map area
# Abbreviated street type -> full street type, used by update_name().
mapping = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
           "Rd.": "Road",
           "Ave": "Avenue"
          }

def audit_street_type(street_types, street_name):
    """Record `street_name` under its trailing token if that token is unexpected.

    `street_types` maps a street type to the set of street names that end
    with it; it is updated in place. Nothing is returned.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when the element's "k" attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Group the unexpected street types found in an OSM file.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        A defaultdict(set) mapping each street type that is not in
        `expected` to the set of "addr:street" values ending with it.
    """
    street_types = defaultdict(set)
    # The with-block closes the file even if parsing raises. Binary mode
    # lets the XML parser handle the document's encoding itself.
    with open(osmfile, "rb") as osm_file:
        # Use "end" events: on a "start" event the children of an element
        # are not guaranteed to have been parsed yet, so <tag> children of a
        # node/way could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

def update_name(name, mapping):
    """Expand an abbreviated street type at the end of a street name.

    Only the last whitespace-delimited token of `name` is looked up in
    `mapping`, and only that token is rewritten. (Replacing the abbreviation
    everywhere in the string would turn "St Johns St" into
    "Street Johns Street".)

    Args:
        name: the street name to clean.
        mapping: dict of abbreviation -> full street type.

    Returns:
        The updated name, or `name` unchanged when its last token is not a
        key of `mapping`. An updated name is also printed.
    """
    match = re.search(r'\S+$', name)
    if match is not None and match.group() in mapping:
        name = name[:match.start()] + mapping[match.group()]
        print(name)
    return name


def test():
    """Print the street-type audit, then run update_name on every flagged name."""
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))

    # update_name prints the names it rewrites, so its result is not used here.
    for ways in st_types.itervalues():
        for name in ways:
            update_name(name, mapping)


if __name__ == '__main__':
    test()

{'101': set(['Northwest Hoyt Street #101']),
 '156th': set(['Southwest 156th']),
 '157th': set(['Southwest 157th']),
 '158th': set(['Southwest 158th']),
 '163rd': set(['Southwest 163rd']),
 '165th': set(['Southwest 165th']),
 '211': set(['Highway 211', 'South Highway 211', 'Southeast Highway 211']),
 '212': set(['Southeast Highway 212']),
 '213': set(['Highway 213', 'South Highway 213']),
 '224': set(['Northwest Highway 224',
             'Southeast Highway 224',
             'Southwest Highway 224']),
 '26': set(['Southeast Highway 26']),
 '4637': set(['4637']),
 '47': set(['Northwest Highway 47',
            'Southwest Highway 47',
            'Southwest Old Highway 47']),
 '97055': set(['44575 Southeast Kleinsmith Rd.\nSandy, OR 97055']),
 '99': set(['Northeast Highway 99']),
 '99E': set(['South Highway 99E']),
 '99W': set(['Northeast State Highway 99W', 'Southwest Old Highway 99W']),
 '99e': set(['South Highway 99e']),
 'Ave': set(['4th Ave', 'SE 16th Ave', 'SE 60th Ave', 'SE 96th 

Overall, street names already look pretty clean. The original case study script caught some street types which were not included in the expected street types of the original script, but I found that many of them are appropriate street types. There were also many street names that end with a direction (e.g., Southwest Willamette Way East). I included some of the valid street types and directions in the list of expected street types and ran the code again. I did not add some unfamiliar street types that were used only once, but they are likely to be valid.

The above output shows there are still a few street types left to audit. They are as follows:
* Highway followed by a highway number: e.g., Southeast Highway 211. These are appropriate street names for an address.
* Missing street types: e.g., 'Southwest 158th', 'Botticelli'. These street names are problematic because they are missing a street type. I googled 'Southwest 158th' and found there is more than one street type with 'Southwest 158th'. I know it has to be 'Southwest 158th Avenue' because of its zip code 97007 listed in the next line of data. However, it is not feasible to update these one by one by googling. I think correcting all of these street names programmatically using other information requires techniques beyond this course.
* Some street names were entered with errors. For example, there were full addresses or street names followed by an apartment number. I did not change these.

### Any problematic "k" values?

In [9]:
### Modified Case Study scripts ###

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

# Keys made only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two such lowercase/underscore parts separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any one of the listed punctuation or whitespace characters.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


# Keys that were classified as "other". test() rebinds this global to a fresh
# set before each run; defining it here as well lets key_type() and
# process_map() be called on their own without a NameError.
others = set()


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and update the counts.

    The key is tested against `lower`, `lower_colon` and `problemchars`, in
    that order; a key matching none of them is counted as "other" and added
    to the global `others` set. Elements that are not <tag> are ignored.

    Args:
        element: the XML element to inspect.
        keys: dict with the counters "lower", "lower_colon", "problemchars"
            and "other"; it is updated in place.

    Returns:
        The same `keys` dict.
    """
    if element.tag == "tag":
        key = element.attrib['k']
        if lower.search(key):
            keys["lower"] += 1
        elif lower_colon.search(key):
            keys["lower_colon"] += 1
        elif problemchars.search(key):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
            others.add(key)  # remember the key so 'other' tags can be inspected
    return keys

def process_map(filename):
    """Return the per-category counts of tag keys found in `filename`."""
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, node in ET.iterparse(filename):
        counts = key_type(node, counts)
    return counts

def test():
    """Print the key-category counts and the 'other' keys of the k=10 sample."""
    global others
    others = set()  # start each run with an empty collection of 'other' keys
    counts = process_map('portland_oregon_sample_k10.osm')
    pprint.pprint(counts)
    # check what 'other' tags are
    pprint.pprint(others)


if __name__ == "__main__":
    test()

{'lower': 247343, 'lower_colon': 245062, 'other': 3567, 'problemchars': 0}
set(['BLDG_ADDR',
     'CCGIS:bicycle',
     'CCGIS:reviewed',
     'CCGIS:trailid',
     'Category',
     'FG:COND_INDEX',
     'FG:GPS_DATE',
     'FG:ORG_CODE',
     'FG:PROP_NO',
     'FG:RTE',
     'FG:area',
     'FG:datafile',
     'FG:lane_miles',
     'FG:perimeter',
     'FG:photo',
     'FG:rte_description',
     'FG:station',
     'FG:visitors',
     'FIXME',
     'ID',
     'ISO3166-2',
     'NHS',
     'ORIG_FID',
     'Phone',
     'RLIS:bicycle',
     'RLIS:bicycle:left',
     'RLIS:hydro_id',
     'RLIS:park_id',
     'RLIS:reviewed',
     'RLIS:systemname',
     'RLIS:systemtype',
     'RLIS:trailid',
     'addr:postcode:left',
     'addr:postcode:right',
     'building:levels:underground',
     'catmp-RoadID',
     'change:lanes:backward',
     'change:lanes:both_ways',
     'change:lanes:bus',
     'change:lanes:forward',
     'description_1',
     'description_2',
     'description_3',
     

I explored the data using the Case Study scripts that categorize the "k" value of each tag into 4 cases: "lower", "lower_colon", "other" and "problemchars". Surprisingly, I found there are no "k" values in "problemchars". I also audited the "other" cases, but found no problem. Many of the 'other' tags are caught because of uppercase letters. These are usually not problematic, but they are potential problems; misused ones will be cleaned later. Some of the 'other' tags are caught because they have two colons, but they will be handled in my final cleaning.

### Auditing "k" values for node tags

In [23]:
### Auditing node tag "k" values ###

import xml.etree.cElementTree as ET
import pprint

   
def audit_node_tag_k(filename):
    """Return the set of "k" values used by <tag> children of <node> elements."""
    seen_keys = set()
    for _, element in ET.iterparse(filename):
        if element.tag != 'node':
            continue
        seen_keys.update(tag.attrib["k"] for tag in element.findall('tag'))
    return seen_keys

def test():
    """Pretty-print the node tag keys of the k=10 sample."""
    pprint.pprint(audit_node_tag_k('portland_oregon_sample_k10.osm'))


if __name__ == "__main__":
    test()


set(['Category',
     'Phone',
     'access',
     'addr:city',
     'addr:country',
     'addr:floor',
     'addr:full',
     'addr:housename',
     'addr:housenumber',
     'addr:postcode',
     'addr:state',
     'addr:street',
     'addr:unit',
     'advertising',
     'aerodrome:type',
     'aeroway',
     'alt_name',
     'amenity',
     'artist_name',
     'artwork_type',
     'atm',
     'attribution',
     'backrest',
     'barrier',
     'bench',
     'bicycle',
     'bicycle_parking',
     'bin',
     'board_type',
     'brand',
     'building',
     'bus',
     'button_operated',
     'capacity',
     'car_wash',
     'census:population',
     'closest_town',
     'clothes',
     'collection_times',
     'colour',
     'contact:phone',
     'contact:website',
     'content',
     'cost:coffee',
     'country',
     'covered',
     'craft',
     'created_by',
     'crossing',
     'crossing:barrier',
     'ctran:reviewed',
     'cuisine',
     'cycleway',
     'delivery',
  

Suspicious values:
* gnis:County_num vs. gnis:county_id 
* gnis:County vs. gnis:county_name
* 'Phone' vs. 'phone' (I will also check other values including 'phone')

### Checking "k" and "v" values of node tags with suspicious "k" values

In [18]:
### Checking "k" and "v" values of node tags with suspicious "k" values ###

import xml.etree.cElementTree as ET
import pprint

# Node tag keys whose values are collected by audit_node_tag_k_suspicious().
Suspicious = ["gnis:County_num","gnis:county_id", "gnis:County", "gnis:county_name",
              'Phone','phone','contact:phone','phone:alternate','phone:tollfree']

def audit_node_tag_k_suspicious(filename):
    """Collect the distinct "v" values of suspicious node tag keys.

    Returns a dict mapping each key from `Suspicious` that occurs on a
    <node> to the list of its distinct values, in first-seen order.
    """
    kv_values = {}
    for _, element in ET.iterparse(filename):
        if element.tag != 'node':
            continue
        for child in element.findall('tag'):
            key = child.attrib["k"]
            if key not in Suspicious:
                continue
            value = child.attrib["v"]
            known = kv_values.setdefault(key, [])
            if value not in known:
                known.append(value)
    return kv_values

def test():
    """Pretty-print the values of the suspicious node tag keys (k=10 sample)."""
    pprint.pprint(audit_node_tag_k_suspicious('portland_oregon_sample_k10.osm'))


if __name__ == "__main__":
    test()


{'Phone': ['503-238-6330'],
 'contact:phone': ['+1-360-882-4082'],
 'gnis:County': ['Clackamas',
                 'Columbia',
                 'Clark',
                 'Multnomah',
                 'Washington',
                 'Marion',
                 'Yamhill'],
 'gnis:County_num': ['005', '009', '011', '051', '067', '047', '071'],
 'gnis:county_id': ['011', '059', '009', '067', '071', '051', '047'],
 'gnis:county_name': ['Marion',
                      'Clackamas',
                      'Clark',
                      'Multnomah',
                      'Washington',
                      'Skamania'],
 'phone': ['+1-503-644-5748',
           '+1-503-595-1205',
           '+1-503-245-3183',
           '+1 (503) 282-9603',
           '+1-503-643-9528',
           '+1-503-284-2300',
           '+1-503-682-2611',
           '+1-503-227-8240',
           '+1-503-227-1109',
           '+1-503-682-9674',
           '+1-503-570-8888',
           '+1-503-826-0901',
           '+1-503-907-3

* As I expected, "gnis:County_num" and "gnis:county_id" seem to be used interchangeably, and so do "gnis:County" and "gnis:county_name". In fact, the following website shows that both "gnis:County_num" and "gnis:county_id" represent the County FIPS code: 
http://wiki.openstreetmap.org/wiki/USGS_GNIS .
The Wiki page also says "County" in "gnis:County" is a county name.

* "Phone" and "phone" were used interchangeably. I will replace "Phone" with "phone".

* Most phone numbers are in the format '+1-###-###-####', but some are not: for example, '503-238-6330', '(360) 258-1713', '+01-503-639-1712' or '5032535327'. I will convert every phone number to the format '+1-###-###-####'.

I wanted to check way tags as well in order to update these problematic tags more systematically.  

### Auditing  "k" values for way tags

In [12]:
### Auditing way tag "k" values ###

import xml.etree.cElementTree as ET
import pprint

   
def audit_way_tag_k(filename):
    """Return the set of "k" values used by <tag> children of <way> elements."""
    found = set()
    for _event, elem in ET.iterparse(filename):
        if elem.tag == 'way':
            found |= set(tag.attrib["k"] for tag in elem.findall('tag'))
    return found


def test():
    """Pretty-print the way tag keys of the k=10 sample."""
    pprint.pprint(audit_way_tag_k('portland_oregon_sample_k10.osm'))


if __name__ == "__main__":
    test()


set(['BLDG_ADDR',
     'CCGIS:bicycle',
     'CCGIS:reviewed',
     'CCGIS:trailid',
     'FG:COND_INDEX',
     'FG:GPS_DATE',
     'FG:ORG_CODE',
     'FG:PROP_NO',
     'FG:RTE',
     'FG:area',
     'FG:datafile',
     'FG:lane_miles',
     'FG:perimeter',
     'FG:photo',
     'FG:rte_description',
     'FG:station',
     'FG:visitors',
     'ID',
     'NHS',
     'ORIG_FID',
     'RLIS:bicycle',
     'RLIS:bicycle:left',
     'RLIS:hydro_id',
     'RLIS:park_id',
     'RLIS:reviewed',
     'RLIS:systemname',
     'RLIS:systemtype',
     'RLIS:trailid',
     'abandoned:highway',
     'abutters',
     'access',
     'access:lanes',
     'addr:city',
     'addr:country',
     'addr:county',
     'addr:full',
     'addr:housename',
     'addr:housenumber',
     'addr:postcode',
     'addr:postcode:left',
     'addr:postcode:right',
     'addr:state',
     'addr:street',
     'addr:unit',
     'admin_level',
     'aeroway',
     'aircraft',
     'alt_name',
     'amenity',
     'animal

* I found 'gnis:county_id', 'gnis:county_name', and 'tiger:county', but they do not seem to be problematic. I will check "v" values of those three anyway.
* 'historic' vs.'history'
* 'servic' vs. 'service'
* 'dog' vs. 'dogs'
* I will check phone numbers as I did for node tags

### Checking "k" and "v" values of way tags with suspicious "k" values

In [19]:
### Checking "k" and "v" values of way tags with suspicious "k" values ###

import xml.etree.cElementTree as ET
import pprint

# Way tag keys whose values are collected by audit_way_tag_k_suspicious().
Suspicious = ['gnis:county_id','gnis:county_name','tiger:county',
              'historic','history','servic','service','dog','dogs',
             'phone','contact:phone','phone:alternate','phone:tollfree']

def audit_way_tag_k_suspicious(filename):
    """Collect the distinct "v" values of suspicious way tag keys.

    Returns a dict mapping each key from `Suspicious` that occurs on a
    <way> to the list of its distinct values, in first-seen order.
    """
    kv_values = {}
    for _, element in ET.iterparse(filename):
        if element.tag != 'way':
            continue
        for child in element.findall('tag'):
            key = child.attrib["k"]
            if key not in Suspicious:
                continue
            value = child.attrib["v"]
            known = kv_values.setdefault(key, [])
            if value not in known:
                known.append(value)
    return kv_values

def test():
    """Pretty-print the values of the suspicious way tag keys (k=10 sample)."""
    pprint.pprint(audit_way_tag_k_suspicious('portland_oregon_sample_k10.osm'))


if __name__ == "__main__":
    test()


{'contact:phone': ['+1-360-574-3242'],
 'dog': ['no', 'yes'],
 'dogs': ['no'],
 'gnis:county_id': ['067', '051', '071', '011', '047', '009'],
 'gnis:county_name': ['Multnomah', 'Washington', 'Clark', 'Clackamas'],
 'historic': ['monument'],
 'history': ['Retrieved from v1', 'Retrieved from v4', 'Retrieved from v5'],
 'phone': ['+1-503-356-2130',
           '+1-503-431-5600',
           '+1-503-916-5280',
           '+1-503-263-7120',
           '+1-503-356-2460',
           '+1-503-554-4850',
           '+1-503-916-6343',
           '+1-503-528-0500',
           '+1-503-493-2715',
           '+1-503-280-1300',
           '+1-503-284-8040',
           '+1-503-284-4693',
           '+1-503-206-7283',
           '+1-503-230-2660',
           '503-208-2021',
           '+1-503-872-3300',
           '+1-503-231-5522',
           '+1-503-232-6601',
           '+1-503-226-4368',
           '+1-503-234-0129',
           '+1-503-287-2262',
           '+1-503-227-3719',
           '+1-503-667-90

* Although county_name and county were used interchangeably, their key types are different. Also, only 'Clackamas' is missing its state name, but its "k" value is different from that of the other counties with state names. Thus, I decided not to change anything for these. However, the way tags might contain the same problematic tags as the node tags. I will change "gnis:County_num" into "gnis:county_id" and "gnis:County" into "gnis:county_name" to be consistent and more specific for both node and way tags.
* 'historic' vs.'history': They are different values, I will not change these.
* 'servic' vs. 'service': 'servic' looks like a typo. I will replace 'servic' with 'service'.
* 'dog' vs. 'dogs': They seem to be used interchangeably. I will replace 'dogs' with 'dog'.
* There are some phone numbers with a format different from '+1-###-###-####'. I will convert every phone number to the format '+1-###-###-####', but special cases like '503-683-5359 Fax: 503-683-4913' will not be changed.

### Updating problematic "k" values

In [24]:
### Modified Case Study scripts to replace k value
### "gnis:County_num" with "gnis:county_id" and 
### "gnis:County" with "gnis:county_name"
### for node and way tags

import xml.etree.cElementTree as ET
import pprint

# Problematic tag key -> replacement key, used by update_kvalue().
mapping = { "gnis:County_num":"gnis:county_id", "gnis:County":"gnis:county_name","Phone":"phone",
           "servic":"service","dogs": "dog"}

def update_kvalue(kvalue, mapping):  # to be used in shape_element function
    """Return the replacement for `kvalue` from `mapping`, or `kvalue` itself."""
    return mapping.get(kvalue, kvalue)

def test():
    """Print every tag key of the k=10 sample that update_kvalue renames."""
    for _, element in ET.iterparse('portland_oregon_sample_k10.osm'):
        for child in element.findall('tag'):
            old_key = child.attrib["k"]
            new_key = update_kvalue(old_key, mapping)
            if new_key == old_key:
                continue
            print("%s  =>  %s" % (old_key, new_key))


if __name__ == '__main__':
    test()

gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gnis:county_name
gnis:County_num  =>  gnis:county_id
gnis:County  =>  gni

### Updating phone numbers

In [11]:
### Changing phone numbers into '+1-###-###-####'  format

# Reference: http://www.diveintopython.net/regular_expressions/phone_numbers.html
import re

'''
Good phone nummber example
'+1-503-238-6330'
Problematic phone number examples =
['503-238-6330','(360) 258-1713', '+01-503-639-1712', '5032535327', '503.236.2970']
'''
# Target format: '+1-' followed by digit groups of 3, 3 and 4 joined by '-'.
good_phone_format = re.compile(r'^\+1\-(\d{3})-(\d{3})-(\d{4})$')

# Only the first sample is already in the target format; the rest print None.
print good_phone_format.search('+1-503-238-6330')
print good_phone_format.search('503-238-6330')
print good_phone_format.search('(360) 258-1713')    
print good_phone_format.search('+01-503-639-1712') #largest number of digits accepted (number of digits =12)   
print good_phone_format.search('5032535327')   
print good_phone_format.search('503.236.2970')

# Looser pattern: groups of 3, 3 and 4 digits with any non-digits in between.
phone_format = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})') # $ at the end was removed

# Show the three digit groups extracted from each problematic sample.
print phone_format.search('503-238-6330').groups() 
print phone_format.search('(360) 258-1713').groups()    
print phone_format.search('+01-503-639-1712').groups()    
print phone_format.search('5032535327').groups()    
print phone_format.search('503.236.2970').groups()    

def update_phone(phone_number, good_phone_format, phone_format):
    """Normalise a phone number to the '+1-###-###-####' format.

    A number that already matches `good_phone_format` is returned as is.
    Otherwise, if it holds 10 to 12 digits and `phone_format` finds three
    digit groups in it, the groups are reassembled in the target format.
    Any other value is reported on stdout and returned unchanged.

    Args:
        phone_number: the raw phone number string.
        good_phone_format: compiled regex for the target format.
        phone_format: compiled regex whose first three groups capture the
            area code, exchange and line number.

    Returns:
        The normalised number, or `phone_number` unchanged.
    """
    if good_phone_format.search(phone_number):
        return phone_number

    searched = None
    # only correct numbers with a plausible number of digits
    if 10 <= sum(char.isdigit() for char in phone_number) <= 12:
        # Strip a leading "1"/"01" country code first, so an unseparated
        # number such as '15035551234' is not split as 150-355-5123. The
        # look-ahead strips only when a complete 10-digit number whose area
        # code starts with 2-9 follows.
        national = re.sub(r'^\D*0?1(?=\D*[2-9]\d{2}\D*\d{3}\D*\d{4})', '',
                          phone_number)
        searched = phone_format.search(national)

    if searched is None:
        print('not good, but uncorrected: %s' % (phone_number,))
        return phone_number

    number_parsed = searched.groups()
    return '+1-' + number_parsed[0] + '-' + number_parsed[1] + '-' + number_parsed[2]

# Spot checks of update_phone on the sample numbers listed above.
print update_phone('+1-503-238-6330', good_phone_format, phone_format),'was already good'
print update_phone('503-238-6330', good_phone_format, phone_format)
print update_phone('(360) 258-1713', good_phone_format, phone_format)    
print update_phone('+01-503-639-1712', good_phone_format, phone_format)    
print update_phone('5032535327', good_phone_format, phone_format)   
print update_phone('503.236.2970', good_phone_format, phone_format)

# This value holds two numbers (more than 12 digits), so it is left unchanged.
print update_phone('503-683-5359 Fax: 503-683-4913', good_phone_format, phone_format),'was not corrected'


def test():
    """Print every phone value of the k=10 sample that update_phone rewrites.

    Only tags whose key contains the lowercase text 'phone' are checked, so
    this should be run after 'Phone' keys have been corrected to 'phone'.
    """
    # This cell imports only `re`; import ElementTree here so the cell does
    # not rely on an import made by an earlier cell.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:  # cElementTree was removed in Python 3.9
        import xml.etree.ElementTree as ET

    for _, element in ET.iterparse('portland_oregon_sample_k10.osm'):
        for child in element.findall('tag'):
            kvalue = child.attrib["k"]
            if 'phone' not in kvalue:
                continue
            original = child.attrib["v"]
            better_number = update_phone(original, good_phone_format, phone_format)
            if better_number != original:
                # Same text as the former `print a, b, " => ", c` statement,
                # in a form that is valid on both Python 2 and Python 3.
                print('%s %s  =>  %s' % (kvalue, original, better_number))


if __name__ == '__main__':
    test()


<_sre.SRE_Match object at 0x00000000048CEF08>
None
None
None
None
None
('503', '238', '6330')
('360', '258', '1713')
('503', '639', '1712')
('503', '253', '5327')
('503', '236', '2970')
+1-503-238-6330 was already good
+1-503-238-6330
+1-360-258-1713
+1-503-639-1712
+1-503-253-5327
+1-503-236-2970
not good, but uncorrected: 503-683-5359 Fax: 503-683-4913
503-683-5359 Fax: 503-683-4913 was not corrected
phone +1 (503) 282-9603  =>  +1-503-282-9603
phone +01-503-639-1712  =>  +1-503-639-1712
phone (503) 281-0594  =>  +1-503-281-0594
phone (360) 835-8860  =>  +1-360-835-8860
phone (503) 657-8269  =>  +1-503-657-8269
phone (503) 245-3857  =>  +1-503-245-3857
phone 503-543-6515  =>  +1-503-543-6515
phone 5032535327  =>  +1-503-253-5327
phone (360) 258-1713  =>  +1-360-258-1713
phone 503-912-0334  =>  +1-503-912-0334
phone 503-208-2021  =>  +1-503-208-2021
phone 503.223.4984  =>  +1-503-223-4984
phone 503.236.2970  =>  +1-503-236-2970
not good, but uncorrected: 503-683-5359    Fax: 503-683-4

In [89]:
# Space-separated numbers with a '+01' prefix; the second one has extra
# trailing characters after the last digit.
print update_phone('+01 503 352 9306', good_phone_format, phone_format)
print update_phone('+01 503 352 9306 ‎', good_phone_format, phone_format)

+1-503-352-9306
+1-503-352-9306


Reference: http://www.diveintopython.net/regular_expressions/phone_numbers.html

### Cleaning a sample & Converting XML to CSV (with Validation)

In [30]:
%%time
### Modified Case Study scripts with validation ###
# Last check with small data

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema  # This is not a package. schema.py should be in the same folder

# Input OSM XML file for this run.
OSM_PATH = "portland_oregon_sample_k200.osm"

# Output CSV file names.
NODES_PATH = "nodes_sample_k200_updated_most.csv"
NODE_TAGS_PATH = "nodes_tags_k200_updated_most.csv"
WAYS_PATH = "ways_k200_updated_most.csv"
WAY_NODES_PATH = "ways_nodes_k200_updated_most.csv"
WAY_TAGS_PATH = "ways_tags_k200_updated_most.csv"

# Key starting with lowercase/underscore characters, a colon, and more of them.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any one of the listed punctuation or whitespace characters.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the local schema.py module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Problematic tag key -> replacement key, used by update_kvalue().
mapping = { "gnis:County_num":"gnis:county_id", "gnis:County":"gnis:county_name","Phone":"phone",
           "servic":"service","dogs": "dog"}

# Abbreviated street type -> full street type, used by update_name().
mapping_street = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
           "Rd.": "Road",
           "Ave": "Avenue"}

def update_name(name, mapping_street):
    """Expand an abbreviated street type at the end of a street name.

    Only the last whitespace-delimited token of `name` is looked up in
    `mapping_street`, and only that token is rewritten. (Replacing the
    abbreviation everywhere in the string would turn "St Johns St" into
    "Street Johns Street".)

    Args:
        name: the street name to clean.
        mapping_street: dict of abbreviation -> full street type.

    Returns:
        The updated name, or `name` unchanged when its last token is not a
        key of `mapping_street`. A change is reported on stdout as a
        'Before:' / 'After:' pair.
    """
    match = re.search(r'\S+$', name)
    if match is not None and match.group() in mapping_street:
        print('Before: %s' % (name,))
        name = name[:match.start()] + mapping_street[match.group()]
        print('After: %s' % (name,))
    return name

def update_kvalue(kvalue, mapping):
    """Return the replacement for a problematic tag key.

    Args:
        kvalue: the tag key to check.
        mapping: dict of problematic key -> replacement key.

    Returns:
        ``mapping[kvalue]`` when `kvalue` is a key of `mapping` (the change
        is reported on stdout as a 'Before:' / 'After:' pair), otherwise
        `kvalue` unchanged.
    """
    if kvalue not in mapping:
        return kvalue
    better = mapping[kvalue]
    # Single-argument print(...) prints the same text as the former print
    # statements and is valid on both Python 2 and Python 3.
    print('Before: %s' % (kvalue,))
    print('After: %s' % (better,))
    return better

# Target format: '+1-' followed by digit groups of 3, 3 and 4 joined by '-'.
good_phone_format = re.compile(r'^\+1\-(\d{3})-(\d{3})-(\d{4})$')
# Looser pattern: groups of 3, 3 and 4 digits with any non-digits in between.
phone_format = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})')

def update_phone(phone_number, good_phone_format, phone_format):
    if not good_phone_format.search(phone_number):
        if 10 <= sum(char.isdigit() for char in phone_number) <= 12:  #only correct numbers with proper digits
            searched = phone_format.search(phone_number)
            if searched is not None: # If searched is None, it makes an error
                print 'Before:', phone_number
                number_parsed = searched.groups()  
                phone_number = '+1-' + number_parsed[0]+'-'+number_parsed[1]+'-'+ number_parsed[2]
                print 'After:', phone_number
            else: 
                print 'not good, but uncorrected:', phone_number
        else:
            print 'not good, but uncorrected:', phone_number
           
    return phone_number

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Turn a node or way XML element into the dicts written to the CSV files.

    Child ``tag`` elements are cleaned first (same handling for nodes and ways):
    tags whose key matches ``problem_chars`` are dropped, keys are corrected
    with ``update_kvalue``, phone values with ``update_phone`` and
    ``addr:street`` values with ``update_name``.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and None for any other element.
    """
    tags = []
    for tag in element.findall('tag'):
        raw_key = tag.attrib['k']
        if problem_chars.search(raw_key):
            continue

        # Correct known-bad keys before splitting or inspecting them.
        full_key = update_kvalue(raw_key, mapping)

        # Text before the first colon is the tag type, the rest is the key.
        tag_type, colon, tag_key = full_key.partition(":")
        if not colon:
            tag_key, tag_type = full_key, default_tag_type

        owner_id = element.attrib['id']

        # The phone check uses the corrected key, so "Phone" is covered too.
        if 'phone' in full_key:
            value = update_phone(tag.attrib["v"], good_phone_format, phone_format)
        elif full_key == "addr:street":
            value = update_name(tag.attrib['v'], mapping_street)
        else:
            value = tag.attrib['v']

        tags.append({'id': owner_id, 'key': tag_key, 'type': tag_type, 'value': value})

    kind = element.tag
    if kind == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}

    if kind == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        # One row per <nd> child; position is its 0-based order within the way.
        way_nodes = [
            {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.findall('nd'))
        ]
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in ``tags``."""
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event carries the document root element.
    root = next(events)[1]
    for action, node in events:
        if action != 'end' or node.tag not in tags:
            continue
        yield node
        # Once the consumer is done with the element, empty the root so
        # already-processed children do not accumulate.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception naming the first reported error if ``element`` fails ``schema``.

    Returns None without side effects when ``validator.validate`` returns True.
    """
    if validator.validate(element, schema) is True:
        return

    # Report only the first (field, errors) pair the validator exposes.
    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing."""

    def writerow(self, row):
        """Write one row dict, encoding each unicode value as UTF-8."""
        encoded = {}
        for key, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[key] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write every row in ``rows`` through ``writerow``."""
        for entry in rows:
            self.writerow(entry)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: path of the OSM XML file to read.
        validate: when exactly True, every shaped element is checked with
            validate_element before it is written.

    Writes a header row plus data rows to the five CSV paths defined at the
    top of the script (nodes, node tags, ways, way nodes, way tags).
    """
    # Binary mode: the Python 2 csv module emits its own "\r\n" terminators,
    # so a text-mode file on Windows would end every row with "\r\r\n".
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        # Relations are not requested, so only nodes and ways reach shape_element.
        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)


Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
not good, but uncorrected: 503-683-5359    Fax: 503-683-4913
Before: 503-972-8780
After: +1-503-972-8780
Wall time: 2min 19s


### Cleaning a bigger sample & Converting XML to CSV (without validation)

In [29]:
%%time
### Modified Case Study scripts without validation ###

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema  # This is not a package. schema.py should be in the same folder

# Input: the k=10 sample of the Portland OSM extract.
OSM_PATH = "portland_oregon_sample_k10.osm"

# Output CSV files written by process_map, one per target SQL table.
NODES_PATH = "nodes_sample_k10_updated_most.csv"
NODE_TAGS_PATH = "nodes_tags_k10_updated_most.csv"
WAYS_PATH = "ways_k10_updated_most.csv"
WAY_NODES_PATH = "ways_nodes_k10_updated_most.csv"
WAY_TAGS_PATH = "ways_tags_k10_updated_most.csv"

# LOWER_COLON matches text starting with two lowercase/underscore runs joined by a colon.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# PROBLEMCHARS matches any one of the listed punctuation/whitespace characters;
# shape_element uses it (as its default problem_chars) to skip tags with such keys.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the local schema.py module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Tag-key corrections: original "k" attribute -> replacement (looked up by update_kvalue).
mapping = { "gnis:County_num":"gnis:county_id", "gnis:County":"gnis:county_name","Phone":"phone",
           "servic":"service","dogs": "dog"}

# Street-type abbreviations and their full forms (used by update_name on addr:street values).
mapping_street = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
           "Rd.": "Road",
           "Ave": "Avenue"}

def update_name(name, mapping_street):
    """Expand an abbreviated street type at the end of a street name.

    Only the last space-separated word of ``name`` is looked up, and only
    that word is replaced. (Replacing every occurrence of the abbreviation
    would corrupt names such as "SE Stark St" or "St Helens Rd".)

    Args:
        name: street name, e.g. "SE Stark St".
        mapping_street: dict mapping an abbreviation ("St", "Rd.", ...) to
            its full form ("Street", "Road", ...).

    Returns:
        The name with its final word expanded, or ``name`` unchanged when
        the final word is not a key of ``mapping_street``.
    """
    head, sep, last_word = name.rpartition(' ')
    if last_word in mapping_street:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Before: ' + name)
        name = head + sep + mapping_street[last_word]
        print('After: ' + name)
    return name

def update_kvalue(kvalue, mapping):
    if kvalue in mapping:
        print 'Before:', kvalue
        kvalue = mapping[kvalue]
        print 'After:', kvalue
    return kvalue

good_phone_format = re.compile(r'^\+1\-(\d{3})-(\d{3})-(\d{4})$')
phone_format = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})')

def update_phone(phone_number, good_phone_format, phone_format):
    """Normalize a phone number to ``+1-XXX-XXX-XXXX`` when possible.

    Args:
        phone_number: raw phone string from a tag value.
        good_phone_format: compiled regex; a value it matches is returned
            unchanged.
        phone_format: compiled regex with (at least) three groups capturing
            area code, exchange and line number.

    Returns:
        The normalized number, or ``phone_number`` unchanged when it is
        already well formed, has fewer than 10 or more than 12 digits, or
        ``phone_format`` finds no match. Every change (and every value left
        uncorrected) is printed.
    """
    if good_phone_format.search(phone_number):
        return phone_number

    digits = [char for char in phone_number if char.isdigit()]
    searched = None
    if 10 <= len(digits) <= 12:  # only correct numbers with proper digits
        candidate = phone_number
        if len(digits) == 11 and digits[0] == '1':
            # Skip the leading country code; otherwise a contiguous number
            # such as "15039728780" would be split as 150-397-2878.
            candidate = phone_number[phone_number.index('1') + 1:]
        searched = phone_format.search(candidate)

    if searched is None:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('not good, but uncorrected: ' + phone_number)
        return phone_number

    print('Before: ' + phone_number)
    number_parsed = searched.groups()
    phone_number = '+1-' + number_parsed[0] + '-' + number_parsed[1] + '-' + number_parsed[2]
    print('After: ' + phone_number)
    return phone_number

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Turn a node or way XML element into the dicts written to the CSV files.

    Child ``tag`` elements are cleaned first (same handling for nodes and ways):
    tags whose key matches ``problem_chars`` are dropped, keys are corrected
    with ``update_kvalue``, phone values with ``update_phone`` and
    ``addr:street`` values with ``update_name``.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and None for any other element.
    """
    tags = []
    for tag in element.findall('tag'):
        raw_key = tag.attrib['k']
        if problem_chars.search(raw_key):
            continue

        # Correct known-bad keys before splitting or inspecting them.
        full_key = update_kvalue(raw_key, mapping)

        # Text before the first colon is the tag type, the rest is the key.
        tag_type, colon, tag_key = full_key.partition(":")
        if not colon:
            tag_key, tag_type = full_key, default_tag_type

        owner_id = element.attrib['id']

        # The phone check uses the corrected key, so "Phone" is covered too.
        if 'phone' in full_key:
            value = update_phone(tag.attrib["v"], good_phone_format, phone_format)
        elif full_key == "addr:street":
            value = update_name(tag.attrib['v'], mapping_street)
        else:
            value = tag.attrib['v']

        tags.append({'id': owner_id, 'key': tag_key, 'type': tag_type, 'value': value})

    kind = element.tag
    if kind == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}

    if kind == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        # One row per <nd> child; position is its 0-based order within the way.
        way_nodes = [
            {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.findall('nd'))
        ]
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in ``tags``."""
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event carries the document root element.
    root = next(events)[1]
    for action, node in events:
        if action != 'end' or node.tag not in tags:
            continue
        yield node
        # Once the consumer is done with the element, empty the root so
        # already-processed children do not accumulate.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception naming the first reported error if ``element`` fails ``schema``.

    Returns None without side effects when ``validator.validate`` returns True.
    """
    if validator.validate(element, schema) is True:
        return

    # Report only the first (field, errors) pair the validator exposes.
    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing."""

    def writerow(self, row):
        """Write one row dict, encoding each unicode value as UTF-8."""
        encoded = {}
        for key, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[key] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write every row in ``rows`` through ``writerow``."""
        for entry in rows:
            self.writerow(entry)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: path of the OSM XML file to read.
        validate: when exactly True, every shaped element is checked with
            validate_element before it is written.

    Writes a header row plus data rows to the five CSV paths defined at the
    top of the script (nodes, node tags, ways, way nodes, way tags).
    """
    # Binary mode: the Python 2 csv module emits its own "\r\n" terminators,
    # so a text-mode file on Windows would end every row with "\r\r\n".
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        # Relations are not requested, so only nodes and ways reach shape_element.
        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)


Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gn

### Cleaning the original data & Converting XML to CSV

In [31]:
%%time
### Modified Case Study scripts without validation (validate=False) ###

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema  # This is not a package. schema.py should be in the same folder

# Input: the original (unsampled) Portland OSM extract.
OSM_PATH = "portland_oregon.osm"

# Output CSV files written by process_map, one per target SQL table.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# LOWER_COLON matches text starting with two lowercase/underscore runs joined by a colon.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# PROBLEMCHARS matches any one of the listed punctuation/whitespace characters;
# shape_element uses it (as its default problem_chars) to skip tags with such keys.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the local schema.py module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Tag-key corrections: original "k" attribute -> replacement (looked up by update_kvalue).
mapping = { "gnis:County_num":"gnis:county_id", "gnis:County":"gnis:county_name","Phone":"phone",
           "servic":"service","dogs": "dog"}

# Street-type abbreviations and their full forms (used by update_name on addr:street values).
mapping_street = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
           "Rd.": "Road",
           "Ave": "Avenue"}

def update_name(name, mapping_street):
    """Expand an abbreviated street type at the end of a street name.

    Only the last space-separated word of ``name`` is looked up, and only
    that word is replaced. (Replacing every occurrence of the abbreviation
    would corrupt names such as "SE Stark St" or "St Helens Rd".)

    Args:
        name: street name, e.g. "SE Stark St".
        mapping_street: dict mapping an abbreviation ("St", "Rd.", ...) to
            its full form ("Street", "Road", ...).

    Returns:
        The name with its final word expanded, or ``name`` unchanged when
        the final word is not a key of ``mapping_street``.
    """
    head, sep, last_word = name.rpartition(' ')
    if last_word in mapping_street:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Before: ' + name)
        name = head + sep + mapping_street[last_word]
        print('After: ' + name)
    return name

def update_kvalue(kvalue, mapping):
    if kvalue in mapping:
        print 'Before:', kvalue
        kvalue = mapping[kvalue]
        print 'After:', kvalue
    return kvalue

good_phone_format = re.compile(r'^\+1\-(\d{3})-(\d{3})-(\d{4})$')
phone_format = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})')

def update_phone(phone_number, good_phone_format, phone_format):
    """Normalize a phone number to ``+1-XXX-XXX-XXXX`` when possible.

    Args:
        phone_number: raw phone string from a tag value.
        good_phone_format: compiled regex; a value it matches is returned
            unchanged.
        phone_format: compiled regex with (at least) three groups capturing
            area code, exchange and line number.

    Returns:
        The normalized number, or ``phone_number`` unchanged when it is
        already well formed, has fewer than 10 or more than 12 digits, or
        ``phone_format`` finds no match. Every change (and every value left
        uncorrected) is printed.
    """
    if good_phone_format.search(phone_number):
        return phone_number

    digits = [char for char in phone_number if char.isdigit()]
    searched = None
    if 10 <= len(digits) <= 12:  # only correct numbers with proper digits
        candidate = phone_number
        if len(digits) == 11 and digits[0] == '1':
            # Skip the leading country code; otherwise a contiguous number
            # such as "15039728780" would be split as 150-397-2878.
            candidate = phone_number[phone_number.index('1') + 1:]
        searched = phone_format.search(candidate)

    if searched is None:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('not good, but uncorrected: ' + phone_number)
        return phone_number

    print('Before: ' + phone_number)
    number_parsed = searched.groups()
    phone_number = '+1-' + number_parsed[0] + '-' + number_parsed[1] + '-' + number_parsed[2]
    print('After: ' + phone_number)
    return phone_number

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Turn a node or way XML element into the dicts written to the CSV files.

    Child ``tag`` elements are cleaned first (same handling for nodes and ways):
    tags whose key matches ``problem_chars`` are dropped, keys are corrected
    with ``update_kvalue``, phone values with ``update_phone`` and
    ``addr:street`` values with ``update_name``.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and None for any other element.
    """
    tags = []
    for tag in element.findall('tag'):
        raw_key = tag.attrib['k']
        if problem_chars.search(raw_key):
            continue

        # Correct known-bad keys before splitting or inspecting them.
        full_key = update_kvalue(raw_key, mapping)

        # Text before the first colon is the tag type, the rest is the key.
        tag_type, colon, tag_key = full_key.partition(":")
        if not colon:
            tag_key, tag_type = full_key, default_tag_type

        owner_id = element.attrib['id']

        # The phone check uses the corrected key, so "Phone" is covered too.
        if 'phone' in full_key:
            value = update_phone(tag.attrib["v"], good_phone_format, phone_format)
        elif full_key == "addr:street":
            value = update_name(tag.attrib['v'], mapping_street)
        else:
            value = tag.attrib['v']

        tags.append({'id': owner_id, 'key': tag_key, 'type': tag_type, 'value': value})

    kind = element.tag
    if kind == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}

    if kind == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        # One row per <nd> child; position is its 0-based order within the way.
        way_nodes = [
            {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.findall('nd'))
        ]
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in ``tags``."""
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event carries the document root element.
    root = next(events)[1]
    for action, node in events:
        if action != 'end' or node.tag not in tags:
            continue
        yield node
        # Once the consumer is done with the element, empty the root so
        # already-processed children do not accumulate.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception naming the first reported error if ``element`` fails ``schema``.

    Returns None without side effects when ``validator.validate`` returns True.
    """
    if validator.validate(element, schema) is True:
        return

    # Report only the first (field, errors) pair the validator exposes.
    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing."""

    def writerow(self, row):
        """Write one row dict, encoding each unicode value as UTF-8."""
        encoded = {}
        for key, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[key] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write every row in ``rows`` through ``writerow``."""
        for entry in rows:
            self.writerow(entry)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: path of the OSM XML file to read.
        validate: when exactly True, every shaped element is checked with
            validate_element before it is written.

    Writes a header row plus data rows to the five CSV paths defined at the
    top of the script (nodes, node tags, ways, way nodes, way tags).
    """
    # Binary mode: the Python 2 csv module emits its own "\r\n" terminators,
    # so a text-mode file on Windows would end every row with "\r\r\n".
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        # Relations are not requested, so only nodes and ways reach shape_element.
        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)


Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gnis:County
After: gnis:county_name
Before: gnis:County_num
After: gnis:county_id
Before: gn

## 3. Creating database from CSV files

Provided Schema: https://gist.github.com/swwelch/f1144229848b407e0a5d13fcb7fbbd6f

### nodes table

In [None]:
import sqlite3
import csv
from pprint import pprint

# (Re)create the `nodes` table in the SQLite database and fill it from nodes.csv.
sqlite_file = 'OpenStreetMap_Portland.db'

# Connect to the database
conn = sqlite3.connect(sqlite_file)

# try/finally: close the connection even when a statement or the CSV read
# fails, so a failed run does not leave the database file held open.
try:
    # Get a cursor object
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS nodes')
    conn.commit()

    # Create the table with schema
    cur.execute('''
    CREATE TABLE nodes(
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT)
''')
    # commit the changes
    conn.commit()

    # Read the csv file as dictionaries and stream one tuple per row into the
    # table. executemany accepts an iterator, so the whole file is never held
    # in memory; the file must stay open until the insert has finished.
    with open('nodes.csv','rb') as fin:
        dr = csv.DictReader(fin) # comma is default delimiter
        to_db = ((i['id'], i['lat'],i['lon'], i['user'].decode("utf-8"), i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr)
        # insert the formatted data
        cur.executemany("INSERT INTO nodes(id, lat, lon,user,uid,version,changeset,timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?);", to_db)
    # commit the changes
    conn.commit()

    # # check that the data imported correctly
    # cur.execute('SELECT * FROM nodes')
    # all_rows = cur.fetchall()
    # print('1):')
    # pprint(all_rows)
finally:
    conn.close()

### ways table

In [None]:
import sqlite3
import csv
from pprint import pprint

# (Re)create the `ways` table in the SQLite database and fill it from ways.csv.
sqlite_file = 'OpenStreetMap_Portland.db'

# Connect to the database
conn = sqlite3.connect(sqlite_file)

# try/finally: close the connection even when a statement or the CSV read
# fails, so a failed run does not leave the database file held open.
try:
    # Get a cursor object
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS ways')
    conn.commit()

    # Create the table with schema
    cur.execute('''
    CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT)
''')
    # commit the changes
    conn.commit()

    # Read the csv file as dictionaries and stream one tuple per row into the
    # table. executemany accepts an iterator, so the whole file is never held
    # in memory; the file must stay open until the insert has finished.
    with open('ways.csv','rb') as fin:
        dr = csv.DictReader(fin) # comma is default delimiter
        to_db = ((i['id'], i['user'].decode("utf-8"), i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr)
        # insert the formatted data
        cur.executemany("INSERT INTO ways(id,user,uid,version,changeset,timestamp) VALUES (?, ?, ?, ?, ?, ?);", to_db)
    # commit the changes
    conn.commit()

    # # check that the data imported correctly
    # cur.execute('SELECT * FROM ways')
    # all_rows = cur.fetchall()
    # print('1):')
    # pprint(all_rows)
finally:
    conn.close()

### nodes_tags table

In [None]:
import sqlite3
import csv
from pprint import pprint

# (Re)create the `nodes_tags` table in the SQLite database and fill it from nodes_tags.csv.
sqlite_file = 'OpenStreetMap_Portland.db'

# Connect to the database
conn = sqlite3.connect(sqlite_file)

# try/finally: close the connection even when a statement or the CSV read
# fails, so a failed run does not leave the database file held open.
try:
    # Get a cursor object
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS nodes_tags')
    conn.commit()

    # Create the table with schema
    cur.execute('''
    CREATE TABLE nodes_tags(
        id INTEGER,
        key TEXT,
        value TEXT,
        type TEXT,
        FOREIGN KEY (id) REFERENCES nodes(id))
''')
    # commit the changes
    conn.commit()

    # Read the csv file as dictionaries and stream one tuple per row into the
    # table. executemany accepts an iterator, so the whole file is never held
    # in memory; the file must stay open until the insert has finished.
    # key and type are decoded as well as value: sqlite3 rejects byte strings
    # containing non-ASCII characters, and tag keys are not ASCII-restricted.
    with open('nodes_tags.csv','rb') as fin:
        dr = csv.DictReader(fin) # comma is default delimiter
        to_db = ((i['id'], i['key'].decode("utf-8"), i['value'].decode("utf-8"), i['type'].decode("utf-8")) for i in dr)
        # insert the formatted data
        cur.executemany("INSERT INTO nodes_tags(id, key,value,type) VALUES (?, ?, ?, ?);", to_db)
    # commit the changes
    conn.commit()

    # # check that the data imported correctly
    # cur.execute('SELECT * FROM nodes_tags')
    # all_rows = cur.fetchall()
    # print('1):')
    # pprint(all_rows)
finally:
    conn.close()

### ways_tags table

In [None]:
import sqlite3
import csv
from pprint import pprint

# (Re)create the `ways_tags` table in the SQLite database and fill it from ways_tags.csv.
sqlite_file = 'OpenStreetMap_Portland.db'

# Connect to the database
conn = sqlite3.connect(sqlite_file)

# try/finally: close the connection even when a statement or the CSV read
# fails, so a failed run does not leave the database file held open.
try:
    # Get a cursor object
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS ways_tags')
    conn.commit()

    # Create the table with schema
    cur.execute('''
    CREATE TABLE ways_tags(
        id INTEGER NOT NULL,
        key TEXT NOT NULL,
        value TEXT NOT NULL,
        type TEXT,
        FOREIGN KEY (id) REFERENCES ways(id))
''')
    # commit the changes
    conn.commit()

    # Read the csv file as dictionaries and stream one tuple per row into the
    # table. executemany accepts an iterator, so the whole file is never held
    # in memory; the file must stay open until the insert has finished.
    # key and type are decoded as well as value: sqlite3 rejects byte strings
    # containing non-ASCII characters, and tag keys are not ASCII-restricted.
    with open('ways_tags.csv','rb') as fin:
        dr = csv.DictReader(fin) # comma is default delimiter
        to_db = ((i['id'], i['key'].decode("utf-8"), i['value'].decode("utf-8"), i['type'].decode("utf-8")) for i in dr)
        # insert the formatted data
        cur.executemany("INSERT INTO ways_tags(id, key,value,type) VALUES (?, ?, ?, ?);", to_db)
    # commit the changes
    conn.commit()

    # # check that the data imported correctly
    # cur.execute('SELECT * FROM ways_tags')
    # all_rows = cur.fetchall()
    # print('1):')
    # pprint(all_rows)
finally:
    conn.close()

### ways_nodes table

In [None]:
import sqlite3
import csv
from pprint import pprint

# (Re)create the `ways_nodes` table in the SQLite database and fill it from ways_nodes.csv.
sqlite_file = 'OpenStreetMap_Portland.db'

# Connect to the database
conn = sqlite3.connect(sqlite_file)

# try/finally: close the connection even when a statement or the CSV read
# fails, so a failed run does not leave the database file held open.
try:
    # Get a cursor object
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS ways_nodes')
    conn.commit()

    # Create the table with schema
    cur.execute('''
    CREATE TABLE ways_nodes(
        id INTEGER NOT NULL,
        node_id INTEGER NOT NULL,
        position INTEGER NOT NULL,
        FOREIGN KEY (id) REFERENCES ways(id),
        FOREIGN KEY (node_id) REFERENCES nodes(id))
''')
    # commit the changes
    conn.commit()

    # Read the csv file as dictionaries and stream one tuple per row into the
    # table. executemany accepts an iterator, so the whole file is never held
    # in memory; the file must stay open until the insert has finished.
    with open('ways_nodes.csv','rb') as fin:
        dr = csv.DictReader(fin) # comma is default delimiter
        to_db = ((i['id'], i['node_id'],i['position']) for i in dr)
        # insert the formatted data
        cur.executemany("INSERT INTO ways_nodes(id, node_id, position) VALUES (?, ?, ?);", to_db)
    # commit the changes
    conn.commit()

    # # check that the data imported correctly
    # cur.execute('SELECT * FROM ways_nodes')
    # all_rows = cur.fetchall()
    # print('1):')
    # pprint(all_rows)
finally:
    conn.close()