# Wrangle OpenStreetMap Data





## Code from Data Wrangling with MongoDB - Lesson 6

In [4]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import codecs
import json
from pprint import pprint

## Count tags

The following routine counts the tags for each

In [6]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_portuguese_re = re.compile(r'^\b\S+\.?', re.IGNORECASE)

def audit_street_type(street_types, street_name, expected):
    m = street_type_portuguese_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile, expected):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'], expected)

    return street_types


def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            name = name.replace(street_type, updated_street_type)
    return name

In [8]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type'] = element.tag
        node.update(element.attrib)
        latitude = node.pop('lat', None)
        longitude = node.pop('lon', None)
        if latitude and longitude:
            node['pos'] = [float(latitude), float(longitude)]
        node['created'] = {}
        for created_attr in CREATED:
            created_attr_value = node.pop(created_attr, None )
            if created_attr_value:
                node['created'][created_attr] = created_attr_value
        
        for tag in  element.iter('tag'):
            k = tag.attrib['k']
            if problemchars.findall(k):
                pass
            elif k[:5] == 'addr:':
                address_key = k[5:]
                if lower_colon.findall(address_key):
                    pass
                else:
                    if 'address' not in node.keys():
                        node['address'] = {}
                    value = tag.attrib.get('v')
                    if address_key =='street':
                        value = update_name(value, mapping)
                    node['address'][address_key] = value
            else:
                node[k] = tag.attrib.get('v')
        node_refs = [tag.get('ref') for tag in element.iter('nd')]
        if node_refs:
            node['node_refs'] = node_refs
        
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
                print "printed "
    return data

## Overview of the Data

In this project I have used the metro area of Porto, Portugal. If you want to run this code, please download the OSM XML data from [Map Zen](https://mapzen.com/data/metro-extracts) and extract it in the directory of this Notebook.

The direct link is the following:
https://s3.amazonaws.com/metro-extracts.mapzen.com/porto_portugal.osm.bz2

In [10]:
path = 'porto_portugal.osm'

#### Size of the file

In [11]:
import os
print "Dataset size is %.1fMB"%(os.path.getsize(path)/1000000.)

Dataset size is 150.6MB


#### Number of unique users

The following users were found in our data:

In [18]:
def get_user(element):
    return element.attrib.get('user')

users = set()
for _, element in ET.iterparse(path):
    user = get_user(element)
    if user:
        users.add(user)

In [24]:
print "----------------------"
print "    First 20 users"
print "----------------------"
for u in list(users)[:20]:
    print u
print '...'
print "----------------------"
print "Total number of unique users: %d"%len(users) 

----------------------
    First 20 users
----------------------
Paulo Salvador
RuiPinto_12
fribeiro
nscerqueira
jfig
Marco Vergueira
paulamir
alvieboy
elsevilla
OSMF Redaction Account
PortugalMaps
Bruno Loureiro
Ropino
AndrewBuck
xybot
meldias
Ruca x16
Lobinho
djpatricio
Ana Pereira
...
----------------------
Total number of unique users: 895


#### Number of different tags (including nodes and ways)

The following function is used to count all the tags that appear in our XML document.

In [5]:
def count_tags(filename):
        # YOUR CODE HERE
        tree = ET.parse(filename)
        root = tree.getroot()

        tags_count = {root.tag: 1}
        for child in root.findall('.//'):
            tag = child.tag
            tags_count[tag] = tags_count.get(tag, 0) + 1
        return tags_count

In [53]:
tags = count_tags(path)
%store tags

Stored 'tags' (dict)


The following tags and respective count can be found in our data:

In [65]:
pprint(tags)

{'bounds': 1,
 'member': 11831,
 'nd': 885620,
 'node': 677414,
 'osm': 1,
 'relation': 1146,
 'tag': 293128,
 'way': 103625}


## Problems encountered in the map


The first thing a noticed in the data is that street names are written in Portuguese, which means that the conventional parsing techniques for English streets does not apply.
In Portuguese the following guidelines may help to define the first version of parser for street addresses:

- Street types appear in the first word of the address
- Street types are commonly Rua, Avenida, Estrada, among others.

Using the method _audit_ I will try to find out other Portuguese street types.
This method basically collects all the street addresses that are not recognized by the parser, grouping them per inferred street type. It infers the street type by extracting the first word of the address.

In [66]:
audit(path)

defaultdict(set,
            {'25': {'25 Abril'},
             'Alameda': {u'Alameda Bas\xedlio Teles',
              'Alameda Futebol Clube de Infesta',
              u'Alameda de S\xe3o Silvestre'},
             u'Av': {u'Av Lu\xeds de Cam\xf5es'},
             'Av.': {'Av. 24',
              u'Av. Men\xe9res',
              'Av. Pedra Verde',
              'Av. Principal',
              'Av. da Pedra Verde'},
             'Brito': {'Brito Capelo'},
             'Cais': {'Cais das Lavandeiras'},
             'Calcada': {'Calcada da Feira dos Dez'},
             u'Cal\xe7ada': {u'Cal\xe7ada da Cabine',
              u'Cal\xe7ada da Junqueira',
              u'Cal\xe7ada da Serra',
              u'Cal\xe7ada de Fontela',
              u'Cal\xe7ada de Valinhos',
              u'Cal\xe7ada do Arco'},
             'Caminho': {'Caminho de Vilar'},
             u'Campo': {u'Campo dos M\xe1rtires da P\xe1tria'},
             'Ciclovia': {'Ciclovia da Foz'},
             u'Costa': {u'Costa Pa

We can see that there were many street types that were not being included.
Let's add it.

In [83]:
expected = ["Rua", "Avenida", "Estrada", "Travessa", "Viela", "Zona", "Praceta", "Praça", "Calçada", "Largo", "Lugar", "Campo", "Ciclovia", "Caminho", "Via"]
audit(path, expected)



defaultdict(set,
            {'25': {'25 Abril'},
             'Alameda': {u'Alameda Bas\xedlio Teles',
              'Alameda Futebol Clube de Infesta',
              u'Alameda de S\xe3o Silvestre'},
             u'Av': {u'Av Lu\xeds de Cam\xf5es'},
             'Av.': {'Av. 24',
              u'Av. Men\xe9res',
              'Av. Pedra Verde',
              'Av. Principal',
              'Av. da Pedra Verde'},
             'Brito': {'Brito Capelo'},
             'Cais': {'Cais das Lavandeiras'},
             'Calcada': {'Calcada da Feira dos Dez'},
             u'Cal\xe7ada': {u'Cal\xe7ada da Cabine',
              u'Cal\xe7ada da Junqueira',
              u'Cal\xe7ada da Serra',
              u'Cal\xe7ada de Fontela',
              u'Cal\xe7ada de Valinhos',
              u'Cal\xe7ada do Arco'},
             u'Costa': {u'Costa Padr\xe3o'},
             'EN': {'EN 204/5'},
             'Esplanada': {'Esplanada do Rio de Janeiro'},
             'Ladeira': {'Ladeira da Quinta Nova'},
 

Another issue is that some street types are written using abreviations. E.g., Sometimes _Avenida_ appears as _Av._.
The method _update_name_ is going to be used to update these



In [97]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada"
}
name = 'R. da Liberdade'
print '%s => %s'%(name, update_name(name, mapping))

R. da Liberdade => Rua da Liberdade


Portuguese has some latin characters that can be a trouble, such as ç in Praça.
Instead, we read Pra\xe7a, which does not make sense.
Although these weird characters might make sense with different encoding, we will also convert them to something Python can recognize.

Let's do this by updating *update_name* method and _mapping_ dictionary.

In [130]:
def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            name = name.encode('utf-8').replace(street_type.encode('utf-8'), updated_street_type)
    return name



In [136]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada",
    u'Urbaniza\xe7\xe3o': 'Urbanização',
    u'Pra\xe7a': 'Praça',
}
name = u'Pra\xe7a da Liberdade'
updated_name = update_name(name, mapping)
print '%s => %s'%(name.encode('utf-8'), updated_name)

Praça da Liberdade => Praça da Liberdade


# City Names

Sometimes the name of cities can also be an issue. Some users may use acronyms, others might suppress some propositions when present in the name.

Let's make an audit for this. I'll start with counting the number of distinct cities.


In [33]:
def count_cities(filename):
        # YOUR CODE HERE
        tree = ET.parse(filename)
        root = tree.getroot()

        cities_count = {}
        for child in root.findall('.//tag'):
            if child.attrib['k'] == 'addr:city':
                city = child.attrib['v'].lower()
                cities_count[city] = cities_count.get(city, 0) + 1
        return cities_count

In [30]:
cities = count_cities(path)

### Distinct citiy names

In [34]:
len(cities)

77

We have 77 different city names.
According to Wikipedia (https://en.wikipedia.org/wiki/Metropolitan_Area_of_Porto#Population), Porto metropolitan area has only 17 cities:

- Santo Tirso
- Trofa
- Arouca
- Oliveira de Azeméis
- Santa Maria da Feira	
- São João da Madeira
- Vale de Cambra	
- Espinho	
- Gondomar
- Maia
- Matosinhos
- Porto
- Póvoa de Varzim
- Valongo	
- Vila do Conde
- Vila Nova de Gaia
- Paredes

In [31]:
pprint(cities)

{'alfena': 3,
 'arcozelo, vila nova de gaia': 1,
 'argoncilhe': 7,
 'baguim do monte': 1,
 'baltar': 2,
 'bonfim': 1,
 'branzelo': 1,
 u'calend\xe1rio': 74,
 u'campanh\xe3': 1,
 'canelas': 1,
 'canelas vng': 1,
 'canidelo': 1,
 'canidelo - v. n. gaia': 1,
 'canidelo - vila nova de gaia': 1,
 u'cast\xealo da maia': 1,
 u'cust\xf3ias': 5,
 u'cust\xf3ias mts': 1,
 'ermesinde': 6,
 u'esmeriz -vila nova de famalic\xe3o': 1,
 'espinho': 7,
 'fajozes': 3,
 u'fi\xe3es': 10,
 'gandra': 2,
 'gens': 1,
 'gondomar': 4,
 u'gondomar (s\xe3o cosme)': 9,
 u'grij\xf3': 8,
 u'guid\xf5es': 1,
 'lagares': 2,
 'lagoa': 3,
 'lamelas': 8,
 u'le\xe7a da palmeira': 4,
 u'le\xe7a do balio': 3,
 'lisboa': 3,
 u'lob\xe3o': 1,
 'lourosa': 24,
 'lousado': 1,
 'macieira da maia': 2,
 'madalena': 1,
 'mafamude': 1,
 'maia': 847,
 'matosinhos': 8,
 u'milheir\xf3s': 3,
 'mindelo': 2,
 'modelos': 1,
 'moreira da maia': 3,
 u'mosteir\xf3': 7,
 'nogueira da regedoura': 10,
 'paranhos': 1,
 'paredes': 1,
 u'pa\xe7os de fer

From the list above we can explain this large number of cities by the fact that many users have given the name of civil parish.

Sometimes both the civil parish and the city are written (e.g., 'canidelo - vila nova de gaia' where _Vila Nova de Gaia_ is the city) and other times only the parish is given (e.g., canidelo).

In addition, cities with long names also appear written with acronyms. E.g., _Vila Nova de Gaia_ appears once as _V. N. Gaia_.

#### Solving the issue automatically

In [81]:
parish_city_mapping = {
    'canidelo' : 'vila nova de gaia',
    'canelas' : 'vila nova de gaia',
    'mafamude': 'vila nova de gaia',
    'baltar': 'paredes',
    'ermesinde': 'valongo',
    u'cust\xf3ias': 'matosinhos',
    u'cast\xealo da maia': 'matosinhos',
    u'le\xe7a da palmeira': 'matosinhos',
    'mindelo': 'vila do conde',
    'bonfim': 'porto',
    'paranhos': 'porto',
    'ramalde': 'porto',
    u'campanh\xe3':'porto',
    'senhora da hora': 'matosinhos',
    'valadares': 'vila nova de gaia',
    u'\xe1guas santas': 'maia',
    'lousado': u'vila nova de famalic\xe3o',
    u'cast\xealo da maia': 'matosinhos',
    u'le\xe7a da palmeira': 'matosinhos',
    'alfena': 'valongo',
    'roriz': 'santo tirso',
    'baguim do monte': 'gondomar',
    'fajozes': 'vila do conde',
}

city_names = [
    'vila nova de gaia',
    'santo tirso',
    'trofa',
    'arouca',
    'oliveira de azeméis',
    'santa maria da feira',
    'são joão da madeira',
    'vale de cambra',
    'espinho',
    'gondomar',
    'maia',
    'matosinhos',
    'porto',
    'póvoa de varzim',
    'valongo',
    'vila do conde',
    'vila nova de gaia',
    'paredes',

]

In [82]:
def update_city_name(name, mapping, city_names):
    name = name.lower().encode('utf-8')
    updated_name = mapping.get(name)
    if updated_name:
        return updated_name
    if name not in city_names:
        for city in city_names:
            if city in name:
                return city
    return name.decode('utf-8')
    

In [83]:
updated_cities = {}
for (city, count) in cities.iteritems():
    updated_city = update_city_name(city, parish_city_mapping, city_names)
    updated_cities[updated_city] = (updated_cities.get(updated_city) or 0) + count
print len(updated_cities)
pprint(updated_cities)

58
{u'argoncilhe': 7,
 u'baguim do monte': 1,
 u'bonfim': 1,
 u'branzelo': 1,
 u'calend\xe1rio': 74,
 u'campanh\xe3': 1,
 u'canelas vng': 1,
 u'canidelo - v. n. gaia': 1,
 u'cust\xf3ias': 5,
 u'cust\xf3ias mts': 1,
 u'esmeriz -vila nova de famalic\xe3o': 1,
 u'espinho': 7,
 u'fajozes': 3,
 u'fi\xe3es': 10,
 u'gandra': 2,
 u'gens': 1,
 'gondomar': 13,
 u'grij\xf3': 8,
 u'guid\xf5es': 1,
 u'lagares': 2,
 u'lagoa': 3,
 u'lamelas': 8,
 u'le\xe7a da palmeira': 4,
 u'le\xe7a do balio': 3,
 u'lisboa': 3,
 u'lob\xe3o': 1,
 u'lourosa': 24,
 u'madalena': 1,
 'maia': 853,
 'matosinhos': 10,
 u'milheir\xf3s': 3,
 u'modelos': 1,
 u'mosteir\xf3': 7,
 u'nogueira da regedoura': 10,
 'paredes': 4,
 u'pa\xe7os de ferreira': 7,
 u'pa\xe7os de ferriera': 1,
 u'pedrou\xe7os': 1,
 u'penafiel': 1,
 'porto': 243,
 u'povoa de varzim': 1,
 u'p\xf3voa de varzim': 1,
 u'ramalde': 6,
 u'ribeir\xe3o': 6,
 u'rio tinto': 5,
 u's. martinho do campo': 1,
 u's.pedro de formariz': 1,
 u'sandim': 1,
 u'santo tirso': 14,
 

## Other ideas about the datasets

One thing that could be interesting would be getting a ranking of the most active contributors in the data.
This could be achieved by making a few changes in the method that retrieves unique users. The collections.Counter class would help in the implementation (see https://docs.python.org/dev/library/collections.html#collections.Counter).


If this dataset was frequently, another that could be done would be getting the areas that have been recently updated -- in the last day/week/month.

This could be done by filtering nodes using the attribute **timestamp** and according to the intended time window.

## Convert data to json

The following method *process_map* processes data with the transformations described above and stores data into a json file with the name "porto_portugal.osm.json".

In [42]:
json_data = process_map("porto_portugal.osm")

[{'created': {'changeset': '12009377',
   'timestamp': '2012-06-25T10:15:29Z',
   'uid': '602999',
   'user': 'rtafav2',
   'version': '5'},
  'id': '24960108',
  'pos': [41.23288, -8.6772787],
  'type': 'node'},
 {'created': {'changeset': '12017708',
   'timestamp': '2012-06-25T23:55:14Z',
   'uid': '602999',
   'user': 'rtafav2',
   'version': '2'},
  'id': '24960109',
  'pos': [41.2628808, -8.6851482],
  'type': 'node'},
 {'created': {'changeset': '10218263',
   'timestamp': '2011-12-27T18:09:16Z',
   'uid': '291378',
   'user': 'Micael Dias',
   'version': '5'},
  'id': '25440124',
  'pos': [41.1443339, -8.5873853],
  'type': 'node'},
 {'created': {'changeset': '8956209',
   'timestamp': '2011-08-08T11:43:29Z',
   'uid': '499804',
   'user': 'rtafav',
   'version': '3'},
  'id': '25440125',
  'pos': [41.1446759, -8.5868845],
  'type': 'node'},
 {'created': {'changeset': '8956209',
   'timestamp': '2011-08-08T11:43:29Z',
   'uid': '499804',
   'user': 'rtafav',
   'version': '2'},
 