# Create MongoDB JSON File From XML Import File

Step 2 is to convert the pre-import XML file to JSON so that we can import it into MongoDB.

In [1]:
# Input: the filtered OSM extract, i.e. the original file minus the rows
# we are not interested in.
FILTERED_FILE = "/Users/markmavromatis/Downloads/san-francisco_california_filtered.osm"

# Outputs: the JSON documents produced by the conversion, and a
# pipe-delimited log of the problems met while producing them.
OUT_JSON_FILE = "/Users/markmavromatis/Downloads/san-francisco_california_filtered.json"
ERRORS_FILE = "/Users/markmavromatis/Downloads/san-francisco_json_creation_errors.txt"

In [2]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict

# Now convert the data to JSON

# Tag-key patterns:
#   lower        - only lowercase letters and underscores
#   lower_colon  - two such segments separated by exactly one colon
#   problemchars - any character we refuse to accept inside a key
# (lower and lower_colon are not referenced in this section of the notebook.)
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that are grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

# The error log stays open for the lifetime of the session and is never
# explicitly closed, so open it line-buffered: every record reaches disk as
# soon as its trailing newline is written instead of sitting in a buffer.
errors_file = open(ERRORS_FILE, "w", buffering=1)
errors_file.write("ID|ERRORTYPE|MESSAGE\n")




# Document fields that shape_element fills in from the element itself. A <tag>
# whose key matches one of these must not be allowed to overwrite it.
_RESERVED_FIELDS = frozenset(["type", "id", "created", "pos", "node_refs"])


def _store_tag(node, key, value):
    """Store one tag value in ``node``, nesting one dictionary per colon in ``key``.

    The first prefix ``addr`` maps to the dictionary ``address``; every other
    prefix maps to ``<prefix>_`` (the underscore keeps the dictionary from
    clashing with a plain tag of the same name).

    Returns True when the value was stored, False when storing it would have
    overwritten a reserved document field or mixed up a nested dictionary
    with a plain value.
    """
    target = node
    top_level = True
    colon_index = key.find(":")
    # "> 0" rather than ">= 0": a remainder that starts with a colon is not
    # split any further and is used verbatim as the final key.
    while colon_index > 0:
        prefix = key[:colon_index]
        if top_level and prefix == 'addr':
            dict_name = 'address'
        else:
            dict_name = prefix + "_"

        child = target.setdefault(dict_name, {})
        if not isinstance(child, dict):
            # A plain tag already occupies the name we need for nesting.
            return False

        target = child
        key = key[colon_index + 1:]
        colon_index = key.find(":")
        top_level = False

    if target is node and key in _RESERVED_FIELDS:
        return False
    if isinstance(target.get(key), dict):
        # Never replace a nested dictionary with a plain value.
        return False

    target[key] = value
    return True


def shape_element(element):
    """Turn a ``node`` or ``way`` XML element into a JSON-ready dictionary.

    The result holds ``type`` (the element tag), ``id``, a ``created``
    sub-dictionary with the attributes named in CREATED, ``pos`` as
    ``[lat, lon]`` floats when both attributes are present, ``node_refs``
    (the ``ref`` of every ``nd`` child of a way) and one entry per ``tag``
    child, nested on colons in the tag key.

    The element's children must already be attached when this is called,
    otherwise ``nd`` and ``tag`` entries are silently missing.

    Problems are not raised; they are appended to ``errors_file`` as
    ``ID|ERRORTYPE|MESSAGE`` lines: a missing CREATED attribute, a tag key
    containing a character matched by ``problemchars``, or a tag key that
    clashes with a field already present in the document.

    Returns None for any element that is neither a node nor a way.
    Raises KeyError if the element has no ``id`` attribute.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    attrib = element.attrib
    node = {'type': element.tag, 'id': attrib['id'], 'created': {}}

    for each_attribute in CREATED:
        if each_attribute in attrib:
            node['created'][each_attribute] = attrib[each_attribute]
        else:
            errors_file.write("{}|MISSING_ATTRIBUTE|Missing created attribute: {}\n".format(node['id'], each_attribute))

    if 'lat' in attrib and 'lon' in attrib:
        node['pos'] = [float(attrib['lat']), float(attrib['lon'])]

    if element.tag == 'way':
        refs = [nd.attrib['ref'] for nd in element.iter('nd')]
        # Only add the field when the way actually references nodes.
        if refs:
            node['node_refs'] = refs

    for child_tag in element.iter("tag"):
        a_key = child_tag.attrib['k']
        a_value = child_tag.attrib['v']

        # Keys with problematic characters are logged and dropped.
        if problemchars.search(a_key) is not None:
            errors_file.write("{}|NODEKEY|Problem characters found in key: {}\n".format(node['id'], a_key))
            continue

        if not _store_tag(node, a_key, a_value):
            errors_file.write("{}|KEYCONFLICT|Tag key clashes with an existing field: {}\n".format(node['id'], a_key))

    return node


def process_map(file_in, pretty):
    """Convert the XML file ``file_in`` to one JSON document per node/way.

    Each element is passed to ``shape_element``; every non-None result is
    written to OUT_JSON_FILE followed by a newline. Note that the output
    path is the module-level OUT_JSON_FILE, not derived from ``file_in``.

    Args:
        file_in: path (or file object) of the XML file to parse.
        pretty: when true, documents are written with a 2-space indent;
            otherwise each document is written compactly.

    Returns None; prints the output path when finished.
    """
    indent = 2 if pretty else None

    with codecs.open(OUT_JSON_FILE, "w") as fo:
        # Listen for "end" events: only then is an element guaranteed to be
        # fully parsed. On "start" its children (the <nd> and <tag> entries
        # shape_element reads) may not have been attached yet.
        for _, element in ET.iterparse(file_in, events=('end',)):
            el = shape_element(element)

            if el is not None:
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The element has been written out; drop its attributes and
                # children so memory does not grow with the size of the file.
                element.clear()

    print("Created output file: {}".format(OUT_JSON_FILE))


# Convert the filtered extract, writing indented (pretty-printed) JSON.
process_map(FILTERED_FILE, pretty=True)
 

Created output file: /Users/markmavromatis/Downloads/san-francisco_california_filtered.json
