# Quiz 1

In [66]:
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import sys

In [67]:
# Tally how often each tag name occurs in the full map extract.
tags = defaultdict(int)

for i, (event, element) in enumerate(ET.iterparse('data/map.osm')):
    if event != 'end':
        continue
    tags[element.tag] += 1
    # Rewrite the same output line every 1000 elements as a progress indicator.
    if i % 1000 == 0:
        sys.stdout.write('\r{} tags processed'.format(i))

2817000 tags processed

In [68]:
tags = dict(tags)

In [69]:
tags

{'bounds': 1,
 'member': 257472,
 'meta': 1,
 'nd': 353950,
 'node': 522491,
 'note': 1,
 'osm': 1,
 'relation': 7016,
 'tag': 1637428,
 'way': 39142}

In [70]:
def count_tags(filename):
    """Count how many times each tag name occurs in an XML file.

    The document is streamed with ``ET.iterparse`` and each element is
    cleared right after it has been counted, so the attributes, text and
    children of already-counted elements are not kept around.

    Args:
        filename: Source handed to ``ET.iterparse`` (the path of the XML file).

    Returns:
        A plain ``dict`` mapping each tag name to the number of elements
        carrying that tag.
    """
    tags = defaultdict(int)
    for event, element in ET.iterparse(filename):
        if event == 'end':
            tags[element.tag] += 1
            # Only the tag name is needed; release everything else the
            # element holds so large files do not pile up in memory.
            element.clear()
    return dict(tags)

In [71]:
def test():
    """Run count_tags on the full map extract and pretty-print the counts."""
    tag_counts = count_tags('data/map.osm')
    pprint.pprint(tag_counts)
    # The course-provided check below is kept for reference only; it is not
    # executed here because this run uses 'data/map.osm':
    #
    # assert tags == {'bounds': 1,
    #                 'member': 3,
    #                 'nd': 4,
    #                 'node': 20,
    #                 'osm': 1,
    #                 'relation': 1,
    #                 'tag': 7,
    #                 'way': 1}


if __name__ == "__main__":
    test()

{'bounds': 1,
 'member': 257472,
 'meta': 1,
 'nd': 353950,
 'node': 522491,
 'note': 1,
 'osm': 1,
 'relation': 7016,
 'tag': 1637428,
 'way': 39142}


# Quiz 2

In [72]:
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

# Map extract explored by the cells of this quiz.
filename = 'data/map.osm'

# Matches a string consisting solely of lowercase ASCII letters and
# underscores (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet as `lower`, but with exactly one ':' separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Character class of "problematic" characters: punctuation such as = + / & < > ; ' " ? % # $ @ , .
# plus space, tab, carriage return and newline. Meant to be used with .search().
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [73]:
# Peek at the first 50 raw lines of the file and show what each pattern
# reports for lines mentioning 'tag'. The patterns are applied to the whole
# raw line here, not to the "k" value.
with open(filename, 'r') as f:
    for i in range(50):
        text = f.readline()
        if 'tag' not in text:
            continue
        print(text)
        for result in (lower.match(text),
                       lower_colon.match(text),
                       problemchars.search(text)):
            print(result)

    <tag k="created_by" v="JOSM"/>

None
None
<_sre.SRE_Match object at 0x176a2bed0>
    <tag k="created_by" v="JOSM"/>

None
None
<_sre.SRE_Match object at 0x176a2bed0>
    <tag k="created_by" v="JOSM"/>

None
None
<_sre.SRE_Match object at 0x176a2bed0>


In [74]:
v = 'validchar'
vlc = 'valid_lower:colon'
prob = 'problem@ti_c'

In [75]:
print(lower.match(v))
print(lower_colon.match(v))
print(problemchars.search(v))

<_sre.SRE_Match object at 0x10de6f558>
None
None


In [76]:
print(lower.match(vlc))
print(lower_colon.match(vlc))
print(problemchars.search(vlc))

None
<_sre.SRE_Match object at 0x14eeb3c68>
None


In [77]:
print(lower.match(prob))
print(lower_colon.match(prob))
print(problemchars.search(prob))

None
None
<_sre.SRE_Match object at 0x176a2bcc8>


In [78]:
# Print the "k" value of every <tag> among the first 10001 parsed elements.
i = 0  # stays 0 if the parser yields nothing
for i, (_, element) in enumerate(ET.iterparse(filename), 1):
    if element.tag == 'tag':
        print(element.attrib['k'])
    if i > 10000:
        break

created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by
created_by

In [79]:
def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    Elements whose tag is not "tag" leave the counters untouched.

    Args:
        element: Parsed XML element.
        keys: Dict with the integer counters "lower", "lower_colon",
            "problemchars" and "other"; it is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib.get('k')
    if key is None:
        # A <tag> without a "k" attribute cannot be classified by pattern.
        keys['other'] += 1
    elif lower.match(key):
        keys['lower'] += 1
    elif lower_colon.match(key):
        keys['lower_colon'] += 1
    elif problemchars.search(key):
        keys['problemchars'] += 1
    else:
        keys['other'] += 1
    return keys

In [80]:
def process_map(filename):
    """Tally the key categories of every element parsed from ``filename``.

    Returns:
        Dict with the counters "lower", "lower_colon", "problemchars" and
        "other" as maintained by ``key_type``.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for parsed in ET.iterparse(filename):
        counts = key_type(parsed[1], counts)
    return counts



def test():
    """Classify the tag keys of 'data/map.osm' and pretty-print the counters."""
    # The course assertion (kept below, disabled) is not run against this
    # input; on submission the code is checked against a different dataset.
    key_counts = process_map('data/map.osm')
    pprint.pprint(key_counts)
    #assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


if __name__ == "__main__":
    test()

{'lower': 197370, 'lower_colon': 1438747, 'other': 1311, 'problemchars': 0}


# Quiz 3

In [81]:
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""
pass

In [82]:
def get_user(element):
    """Return the element's "user" attribute, or None when it has none.

    NOTE(review): the task text for this quiz asks for unique user IDs
    ("uid"), while this reads the "user" attribute (the display name).
    Confirm which one is intended before relying on the result.
    """
    # .get() covers the missing-attribute case without a bare ``except``
    # that would also hide unrelated errors.
    return element.attrib.get('user')

In [83]:
def process_map(filename):
    """Collect the distinct values ``get_user`` yields for a map file.

    Returns:
        A set of every non-None value returned by ``get_user`` for the
        elements parsed from ``filename``.
    """
    found = (get_user(element) for _, element in ET.iterparse(filename))
    return set(name for name in found if name is not None)

In [84]:
def test():
    """Gather the contributors of 'data/map.osm' and pretty-print them."""
    contributors = process_map('data/map.osm')
    pprint.pprint(contributors)
    # assert len(users) == 6


if __name__ == "__main__":
    test()

set(['-Schwarz-',
     '4b696d',
     '4rch',
     '673a',
     '@kevin_bullock',
     u'ALFREDO NU\xd1EZ',
     'AV4TAr',
     'Adrian Mvd',
     'Adrianuy',
     u'Adri\xe1n Hamburger',
     'AgusQui',
     u'Agust\xecn AT',
     'Airwolfless',
     'Aldo Coll',
     'Aleijo',
     'Alejandro Adrien',
     'Alejandro Prunell',
     u'Alejandro Rodr\xedguez Aguilera',
     'AlejandroAlba',
     'Aleks-Berlin',
     'Alexis del Puerto',
     'Alfa417',
     'AlfredoGMx',
     'Alpargator',
     'Aluzz',
     'Alvarit0',
     'Alvaro Rivoir',
     'AlvaroGMJ',
     'AndorinhaViajante',
     'Andre68',
     'Andres Alcarraz',
     'Andres Derderian',
     'Andres Pastorini',
     'AndresDuhour',
     'Angel Turissini',
     'AnthonyBost',
     'Antonieta',
     'Ari Rostkier',
     'ArizonaMapper',
     'Artu Kemanian',
     'Ashenzari',
     'AufzumHorizont',
     'BCNorwich',
     'Baconcrisp',
     'Basstoelpel',
     'Bettina Carrizo',
     'Bman',
     'Braulio Garcia',
     'BrunoX

# Quiz 4

In [85]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# File audited by this quiz.
OSMFILE = "data/example.osm"
# Captures the last whitespace-free token of a street name, including an
# optional trailing period (e.g. "St." in "West Lexington St.").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that are already in their canonical form.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
# Abbreviated street type -> canonical street type. Each key appears once;
# the literal previously listed "St." twice.
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
            }


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type if that type is unexpected.

    Args:
        street_types: Mapping of street type -> set of street names; it is
            updated in place.
        street_name: Street name whose trailing token is inspected.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether the element's "k" attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect street names whose street type is not in ``expected``.

    Args:
        osmfile: Path of the OSM XML file to audit.

    Returns:
        ``defaultdict(set)`` mapping each unexpected street type to the set
        of street names that end with it.
    """
    street_types = defaultdict(set)
    # ``with`` closes the file even if parsing raises part-way through.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: on "start" the parser has only seen the
        # opening tag, so the element's <tag> children may not exist yet and
        # street names could be silently missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [86]:
res = audit(OSMFILE)
res

defaultdict(set,
            {'Ave': {'N. Lincoln Ave', 'North Lincoln Ave'},
             'Rd.': {'Baldwin Rd.'},
             'St.': {'West Lexington St.'}})

In [87]:
def update_name(name, mapping):
    """Replace an abbreviated street type at the end of ``name``.

    Args:
        name: Street name to fix.
        mapping: Dict of abbreviated street type -> replacement.

    Returns:
        ``name`` with its trailing token swapped for ``mapping[token]`` when
        that token is a key of ``mapping``; otherwise ``name`` unchanged.
    """
    suffix_match = re.search(r'\b\S+\.?$', name, re.IGNORECASE)
    if suffix_match is None:
        return name
    suffix = suffix_match.group()
    if suffix not in mapping:
        return name
    # Keep everything before the street type and append the replacement.
    return name[:suffix_match.start()] + mapping[suffix]

In [88]:
def test():
    """Audit OSMFILE, print each proposed street-name fix and check two of them."""
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    # .items() and a single-argument print(...) run unchanged on Python 2
    # and Python 3, matching the print(...) style used in the other cells.
    for st_type, ways in st_types.items():
        for name in ways:
            better_name = update_name(name, mapping)
            print(name + " => " + better_name)
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


if __name__ == '__main__':
    test()

{'Ave': set(['N. Lincoln Ave', 'North Lincoln Ave']),
 'Rd.': set(['Baldwin Rd.']),
 'St.': set(['West Lexington St.'])}
N. Lincoln Ave => N. Lincoln Avenue
North Lincoln Ave => North Lincoln Avenue
West Lexington St. => West Lexington Street
Baldwin Rd. => Baldwin Road


# Quiz 5

In [181]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": "5158",
    "street": "North Lincoln Avenue"
},
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""
pass

### Analysis

In [109]:
# Parse the example file and keep the first 36 elements (indices 0..35) in
# `elements`, printing each one with its attributes and direct children.
file_in = 'data/example.osm'
elements = []
for idx, t in enumerate(ET.iterparse(file_in)):
    element = t[1]
    for shown in (idx, element, element.attrib, list(element)):
        print(shown)
    elements.append(element)
    if idx >= 35:
        break

0
<Element 'bounds' at 0x10deb07e0>
{'minlat': '41.9704500', 'maxlon': '-87.6894800', 'minlon': '-87.6928300', 'maxlat': '41.9758200'}
[]
1
<Element 'node' at 0x10deb0de0>
{'changeset': '11129782', 'uid': '451048', 'timestamp': '2012-03-28T18:31:23Z', 'lon': '-87.6866303', 'visible': 'true', 'version': '7', 'user': 'bbmiller', 'lat': '41.9730791', 'id': '261114295'}
[]
2
<Element 'node' at 0x10deb0c30>
{'changeset': '8448766', 'uid': '451048', 'timestamp': '2011-06-15T17:04:54Z', 'lon': '-87.6878512', 'visible': 'true', 'version': '6', 'user': 'bbmiller', 'lat': '41.9730416', 'id': '261114296'}
[]
3
<Element 'node' at 0x10de9b1b0>
{'changeset': '8581395', 'uid': '451048', 'timestamp': '2011-06-29T14:14:14Z', 'lon': '-87.6939548', 'visible': 'true', 'version': '5', 'user': 'bbmiller', 'lat': '41.9729565', 'id': '261114299'}
[]
4
<Element 'node' at 0x10de9b210>
{'changeset': '8581395', 'uid': '451048', 'timestamp': '2011-06-29T14:14:14Z', 'lon': '-87.6976025', 'visible': 'true', 'version

In [110]:
e = elements[1]

In [111]:
e.attrib

{'changeset': '11129782',
 'id': '261114295',
 'lat': '41.9730791',
 'lon': '-87.6866303',
 'timestamp': '2012-03-28T18:31:23Z',
 'uid': '451048',
 'user': 'bbmiller',
 'version': '7',
 'visible': 'true'}

In [112]:
e.tag

'node'

In [124]:
e2 = elements[31]
children = [c for c in e2]
children

[<Element 'tag' at 0x18243b510>,
 <Element 'tag' at 0x18243b570>,
 <Element 'tag' at 0x18243b5a0>,
 <Element 'tag' at 0x18243b1b0>,
 <Element 'tag' at 0x18243b480>,
 <Element 'tag' at 0x18243b390>,
 <Element 'tag' at 0x18243b360>,
 <Element 'tag' at 0x18243b300>,
 <Element 'tag' at 0x18243b2d0>,
 <Element 'tag' at 0x18243b270>,
 <Element 'tag' at 0x176a81060>]

In [125]:
for c in children:
    print(c.attrib)

{'k': 'addr:city', 'v': 'Chicago'}
{'k': 'addr:housenumber', 'v': '5157'}
{'k': 'addr:postcode', 'v': '60625'}
{'k': 'addr:street', 'v': 'North Lincoln Ave'}
{'k': 'amenity', 'v': 'restaurant'}
{'k': 'cuisine', 'v': 'mexican'}
{'k': 'name', 'v': 'La Cabana De Don Luis'}
{'k': 'outdoor_seating', 'v': 'no'}
{'k': 'phone', 'v': '1 (773)-271-5176'}
{'k': 'smoking', 'v': 'no'}
{'k': 'takeaway', 'v': 'yes'}


In [115]:
e2.attrib

{'changeset': '17206049',
 'id': '2406124091',
 'lat': '41.9757030',
 'lon': '-87.6921867',
 'timestamp': '2013-08-03T16:43:42Z',
 'uid': '1219059',
 'user': 'linuxUser16',
 'version': '2',
 'visible': 'true'}

In [116]:
attribs = e2.attrib

In [182]:
e2.attrib

{'changeset': '17206049',
 'id': '2406124091',
 'lat': '41.9757030',
 'lon': '-87.6921867',
 'timestamp': '2013-08-03T16:43:42Z',
 'uid': '1219059',
 'user': 'linuxUser16',
 'version': '2',
 'visible': 'true'}

### Solution

In [157]:
# Matches a string consisting solely of lowercase ASCII letters and
# underscores (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet as `lower`, but with exactly one ':' separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Character class of "problematic" characters (punctuation, space, tab, CR, LF).
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Matches text beginning with "addr:" when applied with .match().
address = re.compile(r'addr:.*')

# Attribute names listed in the task description as belonging under "created".
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def process_tag(tag, node):
    """Fold one <tag> child into the shaped ``node`` dict.

    Handling by key ("k" attribute):
      * contains a problematic character         -> ignored
      * "addr:<part>" (one colon)                -> node["address"][<part>] = v
      * other "<a>:<b>" key (one colon)          -> node["<a>_c_<b>"] = v
      * only lowercase letters/underscores       -> node[k] = v
      * anything else (e.g. two or more colons)  -> ignored

    Args:
        tag: Element carrying "k" and "v" attributes.
        node: Dict being built for the parent element; updated in place.

    Returns:
        The same ``node`` dict.
    """
    k = tag.attrib['k']
    v = tag.attrib['v']

    if problemchars.search(k):
        return node
    if lower_colon.match(k):
        # lower_colon admits exactly one ':', so keys such as
        # "addr:street:name" never reach this branch and need no extra check.
        prefix, _, suffix = k.partition(':')
        if prefix == 'addr':
            node.setdefault('address', {})[suffix] = v
        else:
            node[k.replace(':', '_c_')] = v
    elif lower.match(k):
        node[k] = v
    return node

In [174]:
def shape_element(element):
    """Turn a "node" or "way" element into the dict model of this quiz.

    Layout of the result:
      * "type": the element's tag name
      * "created": the attributes named in CREATED that the element has
      * "pos": [lat, lon] as floats, when both attributes are present
      * every remaining attribute as a top-level key/value pair
      * <tag> children merged through ``process_tag``
      * <nd> children collected, in order, under "node_refs"

    Args:
        element: Parsed XML element.

    Returns:
        The shaped dict, or None when the element is neither "node" nor "way".
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    attrib = dict(element.attrib)  # work on a copy; the element stays untouched

    # Created info: attributes that are absent are skipped instead of
    # aborting the whole run with a KeyError.
    node['created'] = dict(
        (key, attrib.pop(key)) for key in CREATED if key in attrib
    )
    # Position
    if 'lat' in attrib and 'lon' in attrib:
        node['pos'] = [float(attrib.pop('lat')), float(attrib.pop('lon'))]
    # Other info
    node.update(attrib)
    # Inner tags
    for child in element:
        if child.tag == 'tag':
            process_tag(child, node)
        elif child.tag == 'nd':
            node.setdefault('node_refs', []).append(child.attrib['ref'])

    return node

In [175]:
shape_element(e2)

{'address': {'city': 'Chicago',
  'housenumber': '5157',
  'postcode': '60625',
  'street': 'North Lincoln Ave'},
 'amenity': 'restaurant',
 'created': {'changeset': '17206049',
  'timestamp': '2013-08-03T16:43:42Z',
  'uid': '1219059',
  'user': 'linuxUser16',
  'version': '2'},
 'cuisine': 'mexican',
 'id': '2406124091',
 'name': 'La Cabana De Don Luis',
 'outdoor_seating': 'no',
 'phone': '1 (773)-271-5176',
 'pos': [41.975703, -87.6921867],
 'smoking': 'no',
 'takeaway': 'yes',
 'type': 'node',
 'visible': 'true'}

In [183]:
def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and dump the results as JSON.

    One JSON document per shaped element is written to "<file_in>.json".

    Args:
        file_in: Path of the OSM XML file to read.
        pretty: When true, each document is written with ``indent=2``.

    Returns:
        List of all shaped element dicts, in parse order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                # shape_element produced nothing for this element.
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return data

def test():
    """Shape 'data/example.osm' and check its first and last shaped elements."""
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    shaped = process_map('data/example.osm', True)
    pprint.pprint(shaped)

    expected_first = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": {
            "changeset": "11129782",
            "user": "bbmiller",
            "version": "7",
            "uid": "451048",
            "timestamp": "2012-03-28T18:31:23Z",
        },
    }
    assert shaped[0] == expected_first

    last = shaped[-1]
    assert last["address"] == {"street": "West Lexington St.",
                               "housenumber": "1412"}
    assert last["node_refs"] == ["2199822281", "2199822390", "2199822392",
                                 "2199822369", "2199822370", "2199822284",
                                 "2199822281"]


if __name__ == "__main__":
    test()

[{'created': {'changeset': '11129782',
              'timestamp': '2012-03-28T18:31:23Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '7'},
  'id': '261114295',
  'pos': [41.9730791, -87.6866303],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8448766',
              'timestamp': '2011-06-15T17:04:54Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '6'},
  'id': '261114296',
  'pos': [41.9730416, -87.6878512],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8581395',
              'timestamp': '2011-06-29T14:14:14Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '5'},
  'id': '261114299',
  'pos': [41.9729565, -87.6939548],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8581395',
              'timestamp': '2011-06-29T14:14:14Z',
              'uid': '451048',
              'user': 'bbmiller',

Looks good to me