# P3: Wrangle OpenStreetMap Data

## Data

The map area I chose is the Austin, TX area. As delineated in the class, I obtained the data by downloading an already-prepared extract which I found in the link below:

https://mapzen.com/data/metro-extracts/metro/austin_texas/

I chose the 66MB raw OpenStreetMap OSM XML dataset. After unzipping the file, it gave about 1.4 GB dataset. Opening this dataset using Sublime took a while.

### Preliminary examination of the dataset

This is done to see how the data looks like.

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

In [2]:
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

In [3]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type] += 1

In [4]:
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k,v)

In [5]:
def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

In [6]:
osmfile = "austin_texas.osm"

In [9]:
for event, element in ET.iterparse(osmfile):
    if is_street_name(element):
        audit_street_type(street_types, element.attrib['v'])
print_sorted_dict(street_types)

#100: 2
#101: 1
#104: 1
#150: 1
#203: 2
#260: 1
#300: 2
#3000a: 1
#306: 1
#4: 1
#406: 1
#600: 1
#8: 1
#B100: 1
#F-4: 1
#G-145: 1
#L2: 1
100: 2
104: 1
1100: 45
117: 1
12: 8
120: 1
129: 11
1327: 61
138: 3
1431: 121
150: 2
1625: 76
1626: 91
163: 1
170: 1
1805: 1
1825: 1
1826: 57
183: 7
213: 1
2222: 68
2243: 2
2244: 1
275: 1
2769: 163
280: 3
290: 333
298: 1
3: 1
301: 2
3177: 1
320: 1
35: 25
400: 1
414: 1
45: 1
452: 1
459: 6
535: 2
6: 1
619: 1
620: 551
685: 5
7: 1
71: 17
79: 1
8: 1
812: 176
969: 2
973: 170
A: 76
A-15: 1
A500: 1
Acres: 16
Adventurer: 2
Affirmed: 7
Alley: 44
Alps: 15
Alto: 28
Amistad: 26
Apache: 6
Arbolago: 21
Arrow: 17
Atlantic: 11
Austin: 1
Ave: 33
Ave.: 1
Avene: 1
Avenue: 15891
B: 105
Barrhead: 12
Bend: 1777
Birch: 12
Blackfoot: 7
Bluff: 41
Blvd: 25
Blvd.: 6
Boggy: 4
Bonanza: 20
Bonita: 18
Bottom: 1
Boulevard: 8759
Branch: 17
Bridge: 26
Buckskin: 1
C: 127
C-200: 1
C1-100: 1
Caliche: 5
Calle: 24
Camelback: 6
Camino: 27
Cannon: 1
Cantera: 11
Canterwood: 27
Canyon: 79
Capri: 

# Quiz

In [15]:
tag_counts = defaultdict(int)

In [17]:
for event, elem in ET.iterparse(austinosm):
    tag_counts[elem.tag] +=1

In [18]:
tag_counts

defaultdict(int,
            {'bounds': 1,
             'member': 20197,
             'nd': 6985591,
             'node': 6356394,
             'osm': 1,
             'relation': 2357,
             'tag': 2377504,
             'way': 666390})

In [30]:
keys = defaultdict(int)

In [31]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [32]:
for _, element in ET.iterparse(austinosm):
    if element.tag == 'tag':
        try:
            lower.search(element.attrib['k']).group()
            keys["lower"] += 1
        except AttributeError:
            try:
                lower_colon.search(element.attrib['k']).group()
                keys["lower_colon"] += 1
            except AttributeError:
                try:
                    problemchars.search(element.attrib['k']).group()
                    keys["problemchars"] += 1
                except AttributeError:
                    keys["others"] += 1

In [33]:
keys

defaultdict(int,
            {'lower': 1297812,
             'lower_colon': 1067727,
             'others': 11964,
             'problemchars': 1})

# Users

In [34]:
osm_file = open(austinosm, 'r')

In [35]:
users = set()

In [37]:
for _, element in ET.iterparse(osm_file):
    for key in element.attrib:
        if key == "uid":
            users.add(element.attrib[key])
print users

set(['1917687', '3057995', '2073337', '838709', '74705', '2510226', '152074', '364243', '432828', '113450', '45027', '22461', '3974971', '164683', '107681', '353043', '11547', '4171770', '1836471', '1723831', '2957217', '1822355', '2988060', '2943834', '28852', '2648255', '1602534', '3269109', '4430311', '6367', '252811', '2652160', '80285', '201359', '445917', '364400', '93788', '8909', '96380', '437598', '2942042', '508802', '2813997', '415250', '597278', '3990', '4559658', '663559', '2330541', '2103924', '2411240', '94578', '1970302', '510836', '2929338', '3341346', '3233620', '4716815', '119748', '1940439', '2318', '2306749', '247879', '808770', '2253926', '161371', '4416224', '177640', '179600', '870861', '226191', '142205', '3395279', '70696', '3311414', '47892', '1314413', '111159', '38487', '2552406', '2272717', '47544', '874213', '2890496', '1090081', '2010493', '61855', '118021', '2944689', '1962804', '616774', '1841822', '1745406', '4692089', '687044', '105002', '2406578', '

In [38]:
len(users)

1155

# Auditing and Improving Street Names 

In [39]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

In [40]:
osmfile = "austin_texas.osm"

In [41]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

In [45]:
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Cove", "Highway", "IH-35", "Lane", "North", "Overlook", "Pass"]

In [52]:
mapping = { "St": "Street",
            "St.": "Street",
            "st": "Street",
            "street": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Avene": "Avenue",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Ct": "Court",
            "Ct.": "Court",
            "court": "Court",
            "Cv": "Cove",
            "cv": "Cove",
            "Pl": "Place",
            "Pl.": "Place",
            "lane": "Lane",
            "Ln": "Lane",
            "Rd": "Road", 
            "Rd.": "Road",
            "Trl": "Trail",
            "Pkwy": "Parkway",
            "Hwy": "Highway",
            "I35": "IH-35",
            "IH35": "IH-35",
            "IH35,": "IH-35",
            "lane": "Lane",
            "N": "North",
            "Ovlk": "Overlook",
            "pass": "Pass",
            "W": "West"}

In [53]:
def audit_street_type(stree_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_name = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [54]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [55]:
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

In [56]:
def update_name(name, mapping):
    parts = name.split()
    if parts[-1] in mapping.keys():
        parts[-1] = mapping[parts[-1]]
    name = ' '.join(parts)
    return name

In [57]:
st_types = audit(osmfile)

NameError: global name 'street_type' is not defined

In [None]:
pprint.pprint(dict(st_types))