### 3.a. - XML to JSON - OpenStreetMap


Reference:

https://github.com/bestkao/data-wrangling-with-openstreetmap-and-mongodb/blob/master/data-wrangling-with-openstreetmaps.ipynb

In [1]:
import xml.etree.ElementTree as ET
import pprint

from datetime import datetime

# Element attributes that shape_element stores in the node's 'created' sub-dict.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]

def is_street_name(elem):
    """Tell whether *elem* is a street-name tag.

    Returns True when the element's ``k`` attribute equals ``addr:street``.
    Raises KeyError if the element has no ``k`` attribute.
    """
    tag_key = elem.attrib['k']
    return tag_key == "addr:street"

def shape_element(element):
    """Convert an OSM ``node`` or ``way`` element into a JSON-ready dict.

    The resulting dict contains:

    * ``type``      -- the element tag (``"node"`` or ``"way"``).
    * ``created``   -- sub-dict with the attributes listed in ``CREATED``;
      ``timestamp`` is parsed into a ``datetime``.
    * ``pos``       -- ``[lat, lon]`` as floats, when both attributes exist.
    * ``address``   -- sub-dict built from ``addr:<part>`` tags, keyed by the
      text after ``addr:``.
    * ``node_refs`` -- list of ``ref`` values of the ``nd`` children.
    * every other attribute and tag under its own name; a tag whose key is
      already taken is stored under ``"tag:" + key`` instead of being lost.

    Args:
        element: an ``xml.etree.ElementTree.Element``.

    Returns:
        The shaped dict, or ``None`` when the element is neither a ``node``
        nor a ``way``.

    Raises:
        ValueError: if ``timestamp``, ``lat`` or ``lon`` cannot be parsed.
        KeyError: if a ``tag`` child lacks ``k``/``v`` or an ``nd`` child
            lacks ``ref``.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    attrs = element.attrib

    # Only fold the coordinates into 'pos' when both are present; a lone
    # 'lat' or 'lon' is kept as a plain attribute instead of crashing on
    # float(None).
    has_pos = 'lat' in attrs and 'lon' in attrs
    if has_pos:
        node['pos'] = [float(attrs['lat']), float(attrs['lon'])]

    for name, raw in attrs.items():
        if name in CREATED:
            created = node.setdefault('created', {})
            if name == 'timestamp':
                created[name] = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ')
            else:
                created[name] = raw
        elif has_pos and name in ('lat', 'lon'):
            # Already represented by node['pos'].
            continue
        else:
            # Single if/elif chain: creation attributes must not be copied
            # to the top level as well.
            node[name] = raw

    # Node references are collected before the tags so that a tag that
    # happens to be called 'node_refs' cannot shadow the list.
    refs = [nd.attrib['ref'] for nd in element.iter('nd')]
    if refs:
        node['node_refs'] = refs

    # Split tags into address parts and plain tags.
    address = {}
    plain_tags = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        value = tag.attrib['v']
        prefix, _, sub_key = key.partition(':')
        if prefix == 'addr' and sub_key:
            # Keep everything after "addr:" so that e.g. "addr:street:name"
            # does not overwrite "addr:street".
            address[sub_key] = value
        else:
            # Includes keys such as "address" or a bare "addr", which merely
            # start with "addr" and have no address part to extract.
            plain_tags.append((key, value))

    if address:
        node['address'] = address

    for key, value in plain_tags:
        if key not in node:
            node[key] = value
        else:
            # The name is already used by an attribute or a derived field;
            # keep the tag under a prefixed key rather than dropping it.
            node.setdefault("tag:" + key, value)

    return node

In [2]:
import json
from bson import json_util

def process_map(file_in, pretty = False):
    """Convert an OSM XML file into a file with one JSON document per element.

    Every element accepted by ``shape_element`` is serialised with
    ``json.dumps`` (using ``json_util.default`` for values the json module
    cannot encode itself) and written, followed by a newline, to
    ``<file_in>.json``.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when true, each document is indented by two spaces and
            therefore spans several lines.

    Returns:
        None. The output file is the only result.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None

    # Text mode on purpose: json.dumps returns str, so a file opened with
    # "wb" fails with "TypeError: a bytes-like object is required".
    with open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                fo.write(json.dumps(el, indent=indent, default=json_util.default) + "\n")
                # iterparse keeps every parsed element attached to the tree;
                # drop the contents of elements that were already converted
                # so memory use does not grow with the size of the input.
                # Only converted elements are cleared: their children must
                # stay intact until the parent itself has been shaped.
                element.clear()

# Convert the OSM extract; the output goes to "dados/output_street.osm.json".
process_map("dados/output_street.osm")
