# Case Study Quizzes Applied to Cedar Park osm File

### A look back at the blueprint for cleaning osm files
(See Data Cleaning)

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

##### osm file obtained for Cedar Park, TX

In [2]:
# Path of the OSM XML extract that the cells in this section read.
filename = "ex_hACvaQWTKpAQzk11ZsVUFhXE8ExV7.osm"

##### Regex: a sequence of non-whitespace characters, optionally followed by a period, that must occur at the end of the string


In [3]:
# Matches the final run of non-whitespace characters in a string.  \S already
# accepts ".", so the trailing \.? never changes what is matched, and
# IGNORECASE has no effect because the pattern contains no letters.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Running tally: trailing token -> number of audited street names ending in it.
street_types = defaultdict(int)

In [4]:
def audit_street_type(street_types, street_name):
    """Tally the trailing token of ``street_name`` in ``street_types``.

    The trailing token is whatever the module-level ``street_type_re``
    matches.  Names with no match are ignored.  ``street_types`` is updated
    in place and nothing is returned.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    street_types[match.group()] += 1

In [5]:
def print_sorted_dict(d):
    """Print each ``key: count`` pair of ``d``, ordered case-insensitively by key.

    Args:
        d: Mapping from string keys to integer counts.

    The single-argument ``print(...)`` form is used so the output is the same
    under Python 2 and Python 3 (the bare ``print`` statement is Python 2 only).
    """
    for key in sorted(d, key=lambda s: s.lower()):
        print("%s: %d" % (key, d[key]))

In [6]:
def is_street_name(elem):
    """Return True when ``elem`` is a ``<tag>`` element whose ``k`` is ``addr:street``."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:street"

In [7]:
def audit():
    """Tally the trailing token of every ``addr:street`` value and print the result.

    Reads from the module-level ``osmfile`` handle and accumulates into the
    module-level ``street_types`` counter, then prints the counter sorted by
    key.
    """
    for _event, node in ET.iterparse(osmfile):
        if not is_street_name(node):
            continue
        audit_street_type(street_types, node.attrib['v'])
    print_sorted_dict(street_types)

In [8]:
# Opened at module level because audit() reads the global `osmfile`; the handle
# is closed in a later cell.
osmfile = open(filename, "r")

In [9]:
# Run the audit; it prints one "token: count" line per trailing street token.
if __name__ == '__main__':
    audit()

#100: 1
#406: 1
#G-145: 1
1431: 6
183: 4
2769: 19
301: 1
400: 1
620: 124
Avenue: 188
Barrhead: 12
Bend: 248
Blvd: 5
Boulevard: 186
Camelback: 6
Cantera: 11
Canyon: 67
Casitas: 79
Circle: 301
Costa: 6
Court: 542
Cove: 1064
Ct: 36
Dalmahoy: 10
Dance: 93
Dancer: 6
Dr: 101
Drive: 7531
Dublin: 4
Edenderry: 13
Folkway: 66
Ford: 1
Fork: 3
Fort: 2
Gap: 25
Garden: 7
Hill: 33
Hollow: 26
Horn: 11
Lajitas: 30
Lane: 1784
Ln: 48
Lonesome: 15
Loop: 234
Meadows: 11
Mirador: 6
Mirage: 6
North: 45
Park: 4
Parkway: 251
Pass: 105
Path: 56
Pawnee: 6
Place: 194
Print: 4
Rd: 1
Road: 826
Rock: 3
Run: 57
Sky: 6
Spur: 5
Square: 6
Street: 616
Sundown: 22
Terrace: 16
Thunder: 17
Tiempo: 19
Trace: 14
Trail: 3018
Verdes: 48
View: 1
Vista: 21
Way: 243
Wow: 2


In [13]:
# Release the handle that audit() read from.
osmfile.close()

### Checking which element tags are present, and how often:

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

In [2]:
# Same extract as in the street-type section; execution counts restart at
# In [1] here, so the path is set again for this section.
filename = "ex_hACvaQWTKpAQzk11ZsVUFhXE8ExV7.osm"

In [3]:
# Fresh handle for the element-count pass below.
osmfile = open(filename, "r")

In [4]:
# Element name -> number of elements with that name.
tag_counts = defaultdict(int)

In [5]:
# Count how many times each element name occurs.  iterparse reports "end"
# events by default, so every element is counted once, when it closes.
try:
    for event, elem in ET.iterparse(osmfile):
        tag_counts[elem.tag] += 1
finally:
    # The handle is fully consumed by this pass; close it, as the street-type
    # section does with its own handle.
    osmfile.close()

In [6]:
tag_counts

defaultdict(int,
            {'bounds': 1,
             'member': 562,
             'nd': 732834,
             'node': 662875,
             'osm': 1,
             'relation': 83,
             'tag': 191628,
             'way': 66435})

### Tag Types

"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

In [7]:
# Key made up solely of lowercase ASCII letters and underscores (the empty
# string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs joined by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any one of the listed punctuation or whitespace characters, anywhere in the key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [16]:
# Reopen the extract for the key-classification pass.
osm_file = open("ex_hACvaQWTKpAQzk11ZsVUFhXE8ExV7.osm", "r")

In [17]:
# Bucket name -> number of <tag> keys that fell into that bucket.
k = defaultdict(int)

In [18]:
# Classify the "k" attribute of every <tag> element into one of four buckets.
# The patterns are tried in order: lower, lower_colon, problemchars, else others.
# NOTE(review): the task text above names the last bucket "other"; the key
# "others" is kept so the counts match the output recorded below.
try:
    for _, element in ET.iterparse(osm_file):
        if element.tag != 'tag':
            continue
        key = element.attrib['k']
        # search() returns None when nothing matches, so test the result
        # directly instead of provoking AttributeError from None.group().
        if lower.search(key):
            k["lower"] += 1
        elif lower_colon.search(key):
            k["lower_colon"] += 1
        elif problemchars.search(key):
            k["problemchars"] += 1
        else:
            k["others"] += 1
finally:
    # The handle is fully consumed by this pass; do not leave it open.
    osm_file.close()

In [19]:
k

defaultdict(int, {'lower': 129124, 'lower_colon': 61757, 'others': 747})

### Exploring Users

In [20]:
# Distinct "uid" attribute values found on any element of the file.
users = set()

In [21]:
# Reopen the extract for the user pass.  The handle previously bound to
# `osm_file` is not explicitly closed in the cells shown.
osm_file = open("ex_hACvaQWTKpAQzk11ZsVUFhXE8ExV7.osm", "r")

In [23]:
# Collect the "uid" attribute of every element that carries one.
try:
    for _, element in ET.iterparse(osm_file):
        # Look the attribute up directly rather than scanning all attributes.
        uid = element.attrib.get("uid")
        if uid is not None:
            users.add(uid)
finally:
    # The handle is fully consumed by this pass; do not leave it open.
    osm_file.close()

In [25]:
# Single-argument print(...) produces the same output on Python 2 and also
# runs on Python 3, where the bare print statement is a syntax error.
print(users)

set(['1306', '13832', '77990', '113450', '131059', '2226712', '2330782', '3405475', '353043', '70696', '1836471', '2010493', '384700', '1822355', '422132', '2943834', '55916', '68982', '1725792', '1058397', '388279', '1110270', '360392', '2835928', '364400', '119881', '2082952', '2001899', '1679', '76002', '54759', '2941385', '406921', '3525854', '169004', '1887977', '110797', '27924', '4559658', '1329572', '189061', '270505', '626974', '2015224', '2929338', '22925', '2011339', '1883842', '4191259', '3341346', '3040373', '4732', '243003', '1376118', '1425774', '1424805', '2319962', '2512300', '8703', '520239', '3479270', '1554107', '2219338', '3316148', '1240849', '2944689', '611636', '47892', '293774', '171863', '38487', '1314413', '110263', '646671', '1814467', '874213', '97431', '574861', '60744', '2322261', '3017915', '1917615', '1745406', '1902905', '67862', '343553', '105002', '449777', '2406578', '42429', '36121', '2377377', '166129', '300459', '80285', '136520', '37137', '45169

In [26]:
len(users)

266

# Auditing and Improving Street Names 

Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function to actually fix the street name.
    The function takes a street name string as an argument and should return the fixed name.
    We have provided a simple test so that you can see exactly what is expected.

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

In [2]:
# Same extract as in the earlier sections; execution counts restart at In [1]
# here, so the path is set again for this section.
filename = "ex_hACvaQWTKpAQzk11ZsVUFhXE8ExV7.osm"

In [3]:
# Tail of a street name: a run of non-whitespace characters that starts at a
# word boundary and reaches the end of the string.  IGNORECASE has no effect
# because the pattern contains no letters.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

In [4]:
# Street types treated as correct; names whose trailing token is listed here
# are not reported by audit_street_type.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

In [5]:
# Abbreviation -> full street type; update_name applies it to the last word
# of a street name.
mapping = { "Blvd": "Boulevard",
            "Dr": "Drive",
            "Ct": "Court",
            "Ln": "Lane",
            "Rd": "Road"
          }

In [6]:
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its trailing token when that token is unexpected.

    The trailing token is whatever the module-level ``street_type_re``
    matches.  Names with no match, or whose token appears in the module-level
    ``expected`` list, are skipped.  ``street_types`` (token -> set of names)
    is updated in place and nothing is returned.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

In [7]:
def is_street_name(elem):
    """Return True when the ``k`` attribute of ``elem`` equals ``addr:street``.

    The element's own tag name is not checked here.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [8]:
def audit(osmfile):
    """Collect street names whose trailing token is not an expected street type.

    Args:
        osmfile: Path to an OSM XML file.

    Returns:
        A ``defaultdict(set)`` filled by ``audit_street_type`` from the
        ``addr:street`` values found on ``<tag>`` children of ``node`` and
        ``way`` elements.
    """
    street_types = defaultdict(set)
    # ``with`` closes the file even if parsing raises part-way through.
    with open(osmfile, "r") as osm_file:
        # Listen for "end" events: at "start" the parser has only read the
        # opening tag, so the <tag> children of a node/way may not be attached
        # yet and their street names would be silently skipped.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [9]:
# Unexpected trailing token -> set of street names, for the whole extract.
st_types = audit(filename)

In [11]:
# Pretty-print the audit result for inspection.
pprint.pprint(st_types)

defaultdict(<type 'set'>, {'Vista': set(['Mira Vista']), '620': set(['RM 620', 'FM 620', 'N FM 620', 'Ranch Road 620', 'Ranch-to-Market Road 620', 'RR 620']), 'Edenderry': set(['Edenderry']), 'Ln': set(['Sky Ridge Ln', 'Hidden Hills Ln', 'Rocky Top Ln', 'Lantana Ln']), 'Thunder': set(['Buffalo Thunder']), 'Wow': set(['Pow Wow']), 'Verdes': set(['Palos Verdes']), 'Sky': set(['Cantina Sky']), 'Lajitas': set(['Lajitas']), 'Camelback': set(['Camelback']), 'Rd': set(['Barley Rd']), 'Hollow': set(['Pepper Mill Hollow', 'Brushy Hollow', 'Mill Hollow', 'Salt Mill Hollow']), 'Hill': set(['Coronado Spyglass Hill']), 'Way': set(['Azzuro Way', 'Wickett Way', 'Brindisi Way', 'Castellano Way', 'Appennini Way', 'Wendts Way', 'Benevento Way', 'Wagon Way', 'Lombardi Way', 'Earp Way', 'Balcones Way', 'Altona Way', 'Albania Way', 'Rias Way']), 'Print': set(['Paw Print']), 'Circle': set(['Brookwood Circle', 'South Ridge Circle', 'Laurel Creek Circle', 'Springmail Circle', 'Cedarhurst Circle', 'Waxwing Cir

In [12]:
def update_name(name, mapping):
    """Return ``name`` with an abbreviated final word expanded via ``mapping``.

    Args:
        name: Street name, e.g. ``"Barley Rd"``.
        mapping: Dict from abbreviation to full form, e.g. ``{"Rd": "Road"}``.

    Returns:
        The words of ``name`` re-joined with single spaces, with the last word
        replaced when it is a key of ``mapping``.  A name containing no words
        (empty or whitespace only) is returned unchanged.
    """
    parts = name.split()
    if not parts:
        # No last word to inspect; indexing parts[-1] would raise IndexError.
        return name
    parts[-1] = mapping.get(parts[-1], parts[-1])
    return ' '.join(parts)

In [13]:
# Show the effect of update_name on every audited street name.
# .values() and the single-argument print(...) form work on both Python 2 and
# Python 3 (dict.iteritems and the print statement are Python 2 only).  The
# street-type key was unused, so only the sets of names are iterated.
for ways in st_types.values():
    for name in ways:
        better_name = update_name(name, mapping)
        print("%s => %s" % (name, better_name))

Mira Vista => Mira Vista
RM 620 => RM 620
FM 620 => FM 620
N FM 620 => N FM 620
Ranch Road 620 => Ranch Road 620
Ranch-to-Market Road 620 => Ranch-to-Market Road 620
RR 620 => RR 620
Edenderry => Edenderry
Sky Ridge Ln => Sky Ridge Lane
Hidden Hills Ln => Hidden Hills Lane
Rocky Top Ln => Rocky Top Lane
Lantana Ln => Lantana Lane
Buffalo Thunder => Buffalo Thunder
Pow Wow => Pow Wow
Palos Verdes => Palos Verdes
Cantina Sky => Cantina Sky
Lajitas => Lajitas
Camelback => Camelback
Barley Rd => Barley Road
Pepper Mill Hollow => Pepper Mill Hollow
Brushy Hollow => Brushy Hollow
Mill Hollow => Mill Hollow
Salt Mill Hollow => Salt Mill Hollow
Coronado Spyglass Hill => Coronado Spyglass Hill
Azzuro Way => Azzuro Way
Wickett Way => Wickett Way
Brindisi Way => Brindisi Way
Castellano Way => Castellano Way
Appennini Way => Appennini Way
Wendts Way => Wendts Way
Benevento Way => Benevento Way
Wagon Way => Wagon Way
Lombardi Way => Lombardi Way
Earp Way => Earp Way
Balcones Way => Balcones Way
Alt

### Preparing the Database - SQL