## Example final report to model this script off of

https://gist.github.com/carlward/54ec1c91b62a5f911c42#file-sample_project-md

In [1]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

filename = ('example.osm')

tree = ET.parse(filename)
print(tree)
root = tree.getroot()
print(root)

<xml.etree.ElementTree.ElementTree object at 0x1119e36a0>
<Element 'osm' at 0x1119bf7c8>


In [2]:
def count_tags(filename):
    list = []
    dict = {}
    for event, elem in ET.iterparse(filename):
        list.append(elem.tag)
    for e in list:
        if e in dict:
            dict[e] += 1
        else:
            dict[e] = 1
    return dict

tags = count_tags(filename)
pprint.pprint(tags)

{'bounds': 1,
 'member': 3,
 'nd': 11,
 'node': 23,
 'osm': 1,
 'relation': 1,
 'tag': 42,
 'way': 2}


## Tag Type Audit

In [3]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        # print element.attrib["k"]
        k = element.attrib['k']
        if lower.match(k):
            keys['lower'] += 1
        elif lower_colon.match(k): 
            keys['lower_colon'] += 1
        elif problemchars.match(k):
            keys['problemchars'] += 1
        else: 
            keys['other'] += 1
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for event, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

pprint.pprint(process_map(filename))


{'lower': 21, 'lower_colon': 18, 'other': 3, 'problemchars': 0}


## Unique User Audit

In [4]:
def get_user(element):
    if element.get('uid'):
        return element.get('uid')

def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))
    return users

pprint.pprint(process_map(filename))

{'451048', '939355', '26299', '674454', '1219059', '634589', '567034', '147510'}


## Improving street names

In [5]:
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping):
    find_street = street_type_re.search(name)
    # print m.group()
    if find_street.group() in mapping.keys():
        name = re.sub(find_street.group(), mapping[find_street.group()], name)
    # Alternate solution below using lambda (anonymous function)
    # result = street_type_re.sub(lambda x: mapping[x.group()], name)
    return name

pprint.pprint(dict(audit(filename)))

{'Ave': {'North Lincoln Ave', 'N. Lincoln Ave'},
 'Rd.': {'Baldwin Rd.'},
 'St.': {'West Lexington St.'}}


## Trying to use update name with audit definition

In [6]:
st_types = audit(filename)
for st_type, ways in st_types.items():
        for name in ways:
            better_name = update_name(name, mapping)
            
pprint.pprint(better_name)

'West Lexington Street'


# Big Code from CaseStudy Ready to Put Into CSVs and then Database

In [7]:
# import schema

# OSM_PATH = "example.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def get_element(filename, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    if element.tag == 'node':
        for attr in element.attrib:
            #If attributes are in FIELDS
            if attr in node_attr_fields:
                node_attribs[attr] = element.attrib[attr]
        for child in element:
            if child.tag == 'tag':
                tag = {}
                if problem_chars.search(child.attrib["k"]):
                    pass
                else:
                    for i in NODE_TAGS_FIELDS:
                        if i=='id':
                            tag['id'] = node_attribs['id']
    
                        elif i=='key':
                            if len(child.attrib['k'].split(":"))==1:
                                tag['key'] = child.attrib['k']
    
                            elif len(child.attrib['k'].split(":"))==2:
                                tag['key'] = child.attrib['k'].split(":")[1]
    
                            elif len(child.attrib['k'].split(":"))==3:
                                tag['key']=child.attrib['k'].split(":")[1]+":"+child.attrib['k'].split(":")[2]
    
                        elif i == 'value':
                            tag[i] = child.attrib['v']
    
                        elif i == 'type':
                            if len(child.attrib['k'].split(":"))>=2:
                                tag['type']=child.attrib['k'].split(":")[0]
                            else:
                                tag['type'] = default_tag_type
                tags.append(tag)
            
    elif element.tag == 'way':
        for attr in element.attrib:
            #If attributes are in FIELDS
            if attr in way_attr_fields:
                    way_attribs[attr] = element.attrib[attr]
        count = -1
        for child in element:
            if child.tag == 'tag':
                tag_way = {}
                if problem_chars.search(child.attrib["k"]):
                    pass
                else: 
                    for i in WAY_TAGS_FIELDS:
                        if i=='id':
                            tag_way[i] = way_attribs['id']
    
                        elif i=='key':
                            if len(child.attrib['k'].split(":"))==1:
                                tag_way[i] = child.attrib['k'].split(":")[0]
    
                            elif len(child.attrib['k'].split(":"))==2:
                                tag_way[i] = child.attrib['k'].split(":")[1]
    
                            elif len(child.attrib['k'].split(":"))==3:
                                tag_way[i]=child.attrib['k'].split(":")[1]+":"+child.attrib['k'].split(":")[2]
    
                        elif i == 'value':
                            tag_way[i] = child.attrib['v']
    
                        elif i == 'type':
                            if len(child.attrib['k'].split(":"))>=2:
                                tag_way[i]=child.attrib['k'].split(":")[0]
                            else:
                                tag_way[i] = default_tag_type
                tags.append(tag_way)
            
            elif child.tag == 'nd':
                tag = {}
                count += 1
                for i in WAY_NODES_FIELDS:
                    if i =='id':
                        tag['id'] = way_attribs['id']
                    elif i == 'node_id':
                        tag['node_id'] = child.attrib['ref'] 
                    elif i == 'position':
                        # count += 1
                        tag['position'] = count
                way_nodes.append(tag)

    if element.tag == 'node':
#         print {'node': node_attribs, 'node_tags': tags}
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
#         print {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

element = get_element(filename, tags=('node', 'way', 'relation'))
pprint.pprint(shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,\
                            problem_chars=PROBLEMCHARS, default_tag_type='regular'))

AttributeError: 'generator' object has no attribute 'tag'

## Put this data into a CSV or Find out if you can upload to .DB with XML data

## Use to create database

https://www.w3schools.com/sql/sql_create_db.asp

https://gist.github.com/swwelch/f1144229848b407e0a5d13fcb7fbbd6f