In [18]:
# load libraries
from collections import defaultdict
import re
import os
import pprint
import datetime as dt
import csv
import pandas as pd
import xml.etree.cElementTree as cET

#Load OSM file and path, in this case we are looking at New Orleans, LA, USA
filename = "new-orleans_louisiana.osm"
path = "/Users/matthew/anaconda/envs/udacity_p2/projects/udacity_wrangle_openstreetmap_data/map"
osmFile = os.path.join(path, filename)

In [27]:
#count the number of unique elements in our file to get a picture of the data structure
def count_tags(osm):
        tags = {}
        for event, elem in cET.iterparse(osm):
            if elem.tag in tags: 
                tags[elem.tag] += 1
            else:
                tags[elem.tag] = 1
        return tags
#execute count_tags on our osm file
tags = count_tags(osmFile)
#print the results
pprint.pprint(tags)

{'bounds': 1,
 'member': 68224,
 'nd': 7124566,
 'node': 6413023,
 'osm': 1,
 'relation': 5963,
 'tag': 1664974,
 'way': 378498}


In [28]:
#get the users that have contributed to this OSM file
def get_users(osm):
    users = set()
    for _, element in cET.iterparse(osm):
        for e in element:
            if 'uid' in e.attrib:
                users.add(e.attrib['uid'])
    return users
users = get_users(osmFile)
len(users)

899

In [29]:
lower_case = re.compile(r'^([a-z]|_)*$')
colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
badchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

#search tags for three types of elements, ones that value values that are all lowercase and are valid, ones that are valid but have colons, and one that looks for bad or problem characters
def key_type(element, keys):
    if element.tag == "tag":
        for tag in element.iter('tag'):
            k = tag.get('k')
            if lower_case.search(k):
                keys['lower_case'] += 1
            elif colon.search(k):
                keys['colon'] += 1
            elif badchars.search(k):
                keys['badchars'] += 1
            else:
                keys['other'] += 1
    return keys


def search_keys(osm):
    keys = {"lower_case": 0, "colon": 0, "badchars": 0, "other": 0}
    for _, element in cET.iterparse(osm):
        keys = key_type(element, keys)

    return keys

keys = search_keys(osmFile)
pprint.pprint(keys)

{'badchars': 3, 'colon': 690509, 'lower_case': 659110, 'other': 315352}


In [39]:
#Clean up street names so they all use the same abbreviation, print out the ones we updated.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Avenue", "Boulevard", "Commons", "Court", "Drive", "Lane", "Parkway", 
                         "Place", "Road", "Square", "Street", "Trail"]

mapping = {'Ave'  : 'Avenue',
           'Blvd' : 'Boulevard',
           'Dr'   : 'Drive',
           'Ln'   : 'Lane',
           'Pkwy' : 'Parkway',
           'Rd'   : 'Road',
           'Rd.'   : 'Road',
           'St'   : 'Street',
           'street' :"Street",
           'Ct'   : "Court",
           'Cir'  : "Circle",
           'Cr'   : "Court",
           'ave'  : 'Avenue',
           'Hwg'  : 'Highway',
           'Hwy'  : 'Highway',
           'Sq'   : "Square"}

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in cET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types

def update_street_name(name, mapping, regex):
    m = regex.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = re.sub(regex, mapping[street_type], name)

    return name

streets = audit(osmFile)

for street_type, ways in streets.iteritems():
    for name in ways:
        updated_name = update_street_name(name, mapping, street_type_re)
        if updated_name != name :
            print name, "=>", updated_name
        

Banks St => Banks Street
Octavia St => Octavia Street
S Broad St => S Broad Street
Ponce De Leon St => Ponce De Leon Street
Chartres St => Chartres Street
Magazine Street;Magazine St => Magazine Street;Magazine Street
East St => East Street
E Rutland St => E Rutland Street
North St => North Street
Marais St => Marais Street
Laurel St => Laurel Street
N Rendon St => N Rendon Street
Magazine St => Magazine Street
Lasalle St => Lasalle Street
Tchoupitoulas St => Tchoupitoulas Street
South St => South Street
Toulouse St => Toulouse Street
Dauphine St => Dauphine Street
Decatur St => Decatur Street
543 5th St => 543 5th Street
Royal St => Royal Street
Bourbon St => Bourbon Street
Canal St => Canal Street
South St Patrick St => South St Patrick Street
Whitney St => Whitney Street
S Dupre St => S Dupre Street
Prytania St => Prytania Street
Johnny Dufrene St => Johnny Dufrene Street
Bayou Rd => Bayou Road
West I-10 Service Rd => West I-10 Service Road
Cobia Dr => Cobia Drive
Acadia Dr => Acadi

In [41]:
def audit_zipcode(invalid_zipcodes, zipcode):
    twoDigits = zipcode[0:2]
    
    if not twoDigits.isdigit():
        invalid_zipcodes[twoDigits].add(zipcode)
    
    elif twoDigits != 95:
        invalid_zipcodes[twoDigits].add(zipcode)
        
def is_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_zip(osmfile):
    osm_file = open(osmfile, "r")
    invalid_zipcodes = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zipcode(tag):
                    audit_zipcode(invalid_zipcodes,tag.attrib['v'])

    return invalid_zipcodes

def update_zips(zipcode):
    testNum = re.findall('[a-zA-Z]*', zipcode)
    if testNum:
        testNum = testNum[0]
    testNum.strip()
    if testNum == "LA":
        convertedZipcode = (re.findall(r'\d+', zipcode))
        if convertedZipcode:
            if convertedZipcode.__len__() == 2:
                return (re.findall(r'\d+', zipcode))[0] + "-" +(re.findall(r'\d+', zipcode))[1]
            else:
                return (re.findall(r'\d+', zipcode))[0]


zipcodes = audit_zip(osmFile)

for street_type, ways in zipcodes.iteritems():
    for zipcode in ways:
        updated_zipcode = update_zips(zipcode)
        if zipcode != updated_zipcode :
            print zipcode, "=>", updated_zipcode

63525 => None
39529 => None
39574 => None
39576 => None
39571 => None
39556 => None
39560 => None
39466 => None
39572 => None
39520 => None
39573 => None
39525 => None
39501 => None
39503 => None
LA 70116 => 70116
LA 70117 => 70117
70345 => None
70070 => None
70448 => None
70394 => None
70403 => None
70458 => None
70083 => None
70420 => None
70421 => None
70032 => None
70094 => None
70131 => None
70114 => None
70053 => None
70135 => None
70130-3890 => None
70115 => None
70116 => None
70117 => None
70112 => None
70113 => None
70037 => None
70401 => None
70118 => None
70119 => None
70058 => None
70471 => None
70072 => None
70056 => None
70130 => None
70128 => None
70433 => None
70357 => None
70354 => None
70003 => None
70434 => None
70446 => None
70041 => None
70447 => None
70170 => None
70454 => None
70012 => None
70445 => None
70002 => None
70065 => None
70001 => None
70006 => None
70039 => None
70062 => None
70005 => None
70068 => None
70112-2625 => None
70460 => None
70461 => None
70