## Sample

In [18]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "state-college-pa.osm"  # Replace this with your osm file
SAMPLE_FILE = "pa.osm"

k = 10000 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')



## Preparing for Database

In [1]:
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
multiple_colons = re.compile(r':{2,}')


CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


#The shape_element function runs on each element
def shape_element(element):
    node = {}
    
    if element.tag == "node" or element.tag == "way" :
        # created dictionary for creation info
        created = {}
        # pos list for lat and lon
        pos = []
        node['type'] = element.tag

        # loop through each element attribute
        for a in element.attrib.keys():
            # if key in CREATED list, add key:value to created dictionary
            if a in CREATED: 
                created[a] = element.attrib[a]
                # if created dictionary created, add to node dictionary
                if created: 
                    node['created'] = created
            # if key is lat or lon, add to pos list, then add list to node dict
            elif a == 'lat':
                pos.insert(0,element.get('lat'))
            elif a == 'lon':
                pos.insert(0,element.get('lon'))
                node['pos'] = pos
            # otherwise, add key:value pair of attribute to node dictionary
            else:
                node[a] = element.get(a)
                
        address = {}   
        for subtag in element:
            if subtag.get('k'):
                
                # if tag has two or more colons, ignore
                if re.search(r':.*:', subtag.get('k')):
                    continue
               
                #if tag has problem characters, ignore
                elif problemchars.search(subtag.get('k')):
                    continue
                
                # if tag starts with addr:, add to dictionary "address"
                elif subtag.get('k').startswith('addr:'):
                    address[subtag.get('k')[5:]] = subtag.get('v')
                    if address:
                        node['address'] = address
                else:
                    node[subtag.get('k')] = subtag.get('v')
        element.clear()
        pprint.pprint(node)
    
        
        return node

    else:
        return None
    



#don't edit
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
        element.clear()
        
    pprint.pprint(data)
    return data

In [2]:
process_map('pa.osm', True)

{'created': {'changeset': '2405936',
             'timestamp': '2009-09-07T19:29:11Z',
             'uid': '105255',
             'user': 'Sven L',
             'version': '2'},
 'id': '26095830',
 'pos': ['40.5729814', '-77.5836939'],
 'type': 'node'}
{'created': {'changeset': '3271960',
             'timestamp': '2009-12-02T13:29:41Z',
             'uid': '147510',
             'user': 'woodpeck_fixbot',
             'version': '2'},
 'id': '105977898',
 'pos': ['40.892481', '-78.208992'],
 'type': 'node'}
{'created': {'changeset': '3271960',
             'timestamp': '2009-12-02T13:36:42Z',
             'uid': '147510',
             'user': 'woodpeck_fixbot',
             'version': '2'},
 'id': '106012560',
 'pos': ['40.85055', '-77.498074'],
 'type': 'node'}
{'created': {'changeset': '3174145',
             'timestamp': '2009-11-21T09:12:56Z',
             'uid': '147510',
             'user': 'woodpeck_fixbot',
             'version': '2'},
 'id': '106043980',
 'pos': ['40.902214

[{'created': {'changeset': '2405936',
   'timestamp': '2009-09-07T19:29:11Z',
   'uid': '105255',
   'user': 'Sven L',
   'version': '2'},
  'id': '26095830',
  'pos': ['40.5729814', '-77.5836939'],
  'type': 'node'},
 {'created': {'changeset': '3271960',
   'timestamp': '2009-12-02T13:29:41Z',
   'uid': '147510',
   'user': 'woodpeck_fixbot',
   'version': '2'},
  'id': '105977898',
  'pos': ['40.892481', '-78.208992'],
  'type': 'node'},
 {'created': {'changeset': '3271960',
   'timestamp': '2009-12-02T13:36:42Z',
   'uid': '147510',
   'user': 'woodpeck_fixbot',
   'version': '2'},
  'id': '106012560',
  'pos': ['40.85055', '-77.498074'],
  'type': 'node'},
 {'created': {'changeset': '3174145',
   'timestamp': '2009-11-21T09:12:56Z',
   'uid': '147510',
   'user': 'woodpeck_fixbot',
   'version': '2'},
  'id': '106043980',
  'pos': ['40.902214', '-78.208035'],
  'type': 'node'},
 {'created': {'changeset': '3271960',
   'timestamp': '2009-12-02T13:48:48Z',
   'uid': '147510',
   'use

In [None]:
process_map('state-college-pa.osm', True)

{'created': {'changeset': '2405936',
             'timestamp': '2009-09-07T19:29:11Z',
             'uid': '105255',
             'user': 'Sven L',
             'version': '2'},
 'id': '26095830',
 'pos': ['40.5729814', '-77.5836939'],
 'type': 'node'}
{'created': {'changeset': '2405936',
             'timestamp': '2009-09-07T19:29:11Z',
             'uid': '105255',
             'user': 'Sven L',
             'version': '2'},
 'id': '26095831',
 'pos': ['40.5740064', '-77.5948271'],
 'type': 'node'}
{'created': {'changeset': '222489',
             'timestamp': '2007-02-23T01:19:51Z',
             'uid': '5473',
             'user': 'Adam Killian',
             'version': '1'},
 'created_by': 'JOSM',
 'id': '26095833',
 'pos': ['40.5788942', '-77.5987618'],
 'type': 'node'}
{'created': {'changeset': '2030462',
             'timestamp': '2009-08-03T22:12:18Z',
             'uid': '105255',
             'user': 'Sven L',
             'version': '2'},
 'id': '26095834',
 'pos': ['40.58766

## Import
1. In command prompt, cd to C:\Program Files\MongoDB\Server\3.2\bin.
2. mongoimport -d test -c balt --file C:\Users\Mister\Documents\data_analyst\P3_Data_Wrangling\dc-balt.osm.json

In [4]:
from pymongo import MongoClient

client = MongoClient()
db = client.test

## Number of documents

In [5]:
db.balt.find().count()

351325

## Size of database

In [6]:
db.balt.stats()

TypeError: 'Collection' object is not callable. If you meant to call the 'stats' method on a 'Collection' object it is failing because no such method exists.

In [7]:
db.sample.dataSize()

TypeError: 'Collection' object is not callable. If you meant to call the 'dataSize' method on a 'Collection' object it is failing because no such method exists.

## Number of nodes

In [8]:
db.balt.find({'type': 'node'}).count()

337837

## Number of ways

In [9]:
db.balt.find({'type': 'ways'}).count()

0

## Number of unique users

In [10]:
print db.balt.distinct("created.user")

[u'blackadder', u'asciiphil', u'woodpeck_fixbot', u'mdroads', u'ecaldwell', u'pnorman_mechanical', u'Cavit', u'RoadGeek_MD99', u'NE2', u'sadam', u'SammyB428', u'Jack Russell', u'aude', u'westendguy', u'Schierkolk', u'Your Village Maps', u'wonderchook', u'dchiles', u'JoshD', u'cbwalkup', u'mutantmonkey', u'Diego', u'PhilR8', u'ElliottPlack', u'RJCorazza', u'alongthepike', u'Burninate', u'Evanator', u'Bored', u'kriscarle', u'Will White', u'S_H', u'woodpeck_repair', u'campnatt', u'dwbond', u'jaakkoh', u'DavidYJackson_import', u'Jeffrey Hearn', u'EP_Import', u'mdroads_import', u'ingalls', u'gdoyle', u'TorCguy', u'Cheng Wang', u'ceyockey', u'mpetroff-imports', u'Steven Vance', u'Sarr_Cat', u'blacklocust', u'LaurenJay', u'richardsonp', u'wvdp', u'RoadWarrier', u'JHSpyHard', u'DaveHansenTiger', u'bot-mode', u'RG-MD99_import', u'geobrando_dcbuildingsaddresses', u'uboot', u'wambacher', u'slover98', u'shoe', u'karitotp', u'Sven L', u'Adam Killian', u'oldtopos', u'SK53', u'pkoby', u'HattoriHanzo'

In [11]:
print len(db.balt.distinct("created.user"))

265


http://stackoverflow.com/questions/30327508/mongodb-osm-street-maps-unique-users
distinct returns a list.  to get the length of a list in python, use length()

## Top 5 users with highest number of entries

In [12]:
import pprint
cursor = db.balt.aggregate([
        {'$group': {'_id': "$created.user", 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
        {'$limit': 5}
    ])
for doc in cursor:
    pprint.pprint(doc)

{u'_id': u'woodpeck_fixbot', u'count': 215438}
{u'_id': u'Sven L', u'count': 79701}
{u'_id': u'TIGERcnl', u'count': 8136}
{u'_id': u'bot-mode', u'count': 6497}
{u'_id': u'DaveHansenTiger', u'count': 4864}


## Number and Types of Buildings

In [13]:
import pprint
cursor = db.balt.aggregate([
        {'$group': {'_id': "$building", 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ])
for doc in cursor:
    pprint.pprint(doc)

{u'_id': None, u'count': 351283}
{u'_id': u'yes', u'count': 16}
{u'_id': u'entrance', u'count': 16}
{u'_id': u'residential', u'count': 5}
{u'_id': u'house', u'count': 2}
{u'_id': u'detached', u'count': 2}
{u'_id': u'ruins', u'count': 1}


## Postcode Check

In [14]:
print db.balt.distinct('address.postcode')

[u'21128', u'21222', u'21133', u'20020', u'20019', u'21093', u'21236', u'21204', u'21244', u'21218', u'21217', u'21227', u'21214', u'16803', u'16801-9998', u'16801', u'16801-4736', u'16801-3838', u'16801-4922', u'16801-2812', u'16801-4713', u'16802-2604', u'16801-4032', u'16801-2810', u'16801-3923', u'16801-3922', u'16801-3919', u'16823', u'16669', u'16870', u'16801-7307', u'17004', u'16828', u'17841', u'16804', u'16858']


## Type check

In [15]:
print db.balt.distinct('type')

[u'node', u'broad_leaved', u'way', u'Public']


In [16]:
cursor = db.sample.find({'type': 'broad_leaved'})
for doc in cursor:
    print doc

{u'natural': u'tree', u'created': {u'changeset': u'35623297', u'version': u'4', u'user': u'woodpeck_repair', u'timestamp': u'2015-11-28T07:12:55Z', u'uid': u'145231'}, u'lon': u'-77.4423822', u'pos': [39.0586388, -77.4423822], u'_id': ObjectId('578d159c87b6d73f4147c126'), u'type': u'broad_leaved', u'id': u'2405872915'}
{u'natural': u'tree', u'created': {u'changeset': u'35623297', u'version': u'4', u'user': u'woodpeck_repair', u'timestamp': u'2015-11-28T07:12:55Z', u'uid': u'145231'}, u'pos': [u'39.0586388', u'-77.4423822'], u'_id': ObjectId('579a696de300de16b3497a08'), u'type': u'broad_leaved', u'id': u'2405872915'}
{u'natural': u'tree', u'created': {u'changeset': u'35623297', u'version': u'4', u'user': u'woodpeck_repair', u'timestamp': u'2015-11-28T07:12:55Z', u'uid': u'145231'}, u'pos': [u'39.0586388', u'-77.4423822'], u'_id': ObjectId('579e691e757d8f6e3fbdd01b'), u'type': u'broad_leaved', u'id': u'2405872915'}
{u'natural': u'tree', u'created': {u'changeset': u'35623297', u'version':

## Street check

In [17]:
print db.balt.distinct('address.street')

[u'Anns Garden Way', u'Eastern Avenue', u'Oak Trace Way', u'Savannah Terrace Southeast', u'Grant Street Northeast', u'Fountain Hill Drive', u'Fox Brier Lane', u'West Joppa Road', u'Belclare Road', u'Fairbrook Road', u'Cator Avenue', u'Harlem Avenue', u'Benson Avenue', u'Glenoak Avenue', u'School Drive', u'S Fraser St', u'South Allen Street', u'South Pugh Street', u'West College Avenue', u'East College Avenue', u'East Beaver Avenue', u'East Calder Way', u'West Beaver Avenue', u'Food Science Building', u'Transfer Road', u'Premiere Dr', u'South Atherton Street', u'South Buckhout Street', u'N Patterson St', u'Hiester St', u'S Hiester St', u'S Garner St', u'S Sparks St', u'North Atherton Street', u'McAllister Aly', u"McAlevy's Fort Rd", u'Montauk Circle', u'South Burrowes Street', u'South Fraser Street', u'West Calder Way', u'Miller Alley', u'Northland Center', u'Benner Pike', u'E Main St', u'S. Fraser St.', u'S Atherton Street', u'North Atherton St.', u'Houser Road', u'Earlystown Road', u'

## Ammenity

In [18]:
print db.balt.distinct('ammenity')

[]
