In [22]:
import seaborn as sns
import metapack as mp
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from itertools import chain 
from tqdm import tqdm
import libgeohash as gh
import shapely
from shapely.geometry import Point
from shapely.wkt import loads as loads_wkt
tqdm.pandas()

from demosearch.util import munge_pbar,  run_mp, gh_data_path

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for figure element scaling.
sns.set_context('notebook')
# Metapack's notebook initialisation hook.
# NOTE(review): what it sets up is not visible here; see the metapack docs.
mp.jupyter.init()

# EPSG code of the projected CRS that the final geohash points are converted to.
# NOTE(review): 26911 should be NAD83 / UTM zone 11N; confirm this is intended
# for data that spans more than that zone.
utm_crs = 26911


In [23]:
# Open the Metapack package this notebook belongs to. The alternative opener is
# kept below for reference; only the source-package variant is active.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression: display the package as the cell output.
pkg

In [3]:
# OSM tag keys to pull out of the hstore-style `other_tags` column; each key
# becomes one column of the extracted tag table.
extract_tags = ['amenity', 'tourism', 'shop', 'leisure', 'natural', 'parking']

In [26]:
%%time
# Load the package's 'points' reference into a DataFrame.
# NOTE(review): low_memory=False is presumably forwarded to pandas.read_csv to
# avoid chunked, mixed-dtype column inference; confirm against metapack.
df = pkg.reference('points').read_csv(low_memory=False)

CPU times: user 49.5 s, sys: 6.82 s, total: 56.4 s
Wall time: 56.7 s


In [25]:
# Preview the first rows of the raw table to check the available columns.
df.head()

Unnamed: 0,geometry,osm_id,name,highway,waterway,aerialway,barrier,man_made,z_order,other_tags
0,"LINESTRING (-89.5091987 44.5350956,-89.5060015...",379,Jurgella Lane,residential,,,,,3,
1,"LINESTRING (-89.5857576 44.5411258,-89.5844878...",929,North Point Drive,tertiary,,,,,4,
2,"LINESTRING (-89.5494702 44.4817599,-89.5494371...",930,Post Road,primary,,,,,7,"""hgv""=>""designated"",""ref""=>""US 51 Business"",""o..."
3,"LINESTRING (-89.5747358 44.5193199,-89.5746305...",1065,Jefferson Street,tertiary,,,,,4,
4,"LINESTRING (-115.8097543 33.7365272,-115.80980...",1855,,path,,,,,0,


In [5]:
def _extract_tags(df, extract_tags):

    from sqlalchemy.dialects.postgresql import HSTORE

    h = HSTORE()
    f = h.result_processor(None, None)

    # Prune the dataset to just the records that have the tags we want.
    # before getting to the more expensive operation of extracting the tags.
    # This should reduce the dataset from 24M rows to less than 6M.
    t = df.dropna(subset=['other_tags'])
    t = t[t.highway.isnull()]

    flags = [t.other_tags.str.contains(e) for e in extract_tags]
    comb_flags = [any(e) for e in list(zip(*flags))]

    t = t[comb_flags]

    rows = []
    errors = []
    for idx, r in t.set_index('osm_id')[['other_tags']].iterrows():
        try:
            d = f(r.other_tags)
            rows.append([idx] + [d.get(e) for e in extract_tags])
        except TypeError as e:
            errors.append(r, e)

    return (rows, errors)

In [7]:
# Split the file and extract tags in multiprocessing
N_task = 200

# Cut the frame into N_task contiguous, nearly equal row ranges with .iloc.
# np.array_split() on a DataFrame goes through DataFrame.swapaxes, which pandas
# has deprecated, so the chunk boundaries are computed explicitly instead.
chunk_bounds = np.linspace(0, len(df), N_task + 1).astype(int)
tasks = [(df.iloc[start:stop], extract_tags)
         for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])]

# NOTE(review): run_mp presumably calls _extract_tags once per task tuple and
# returns the (rows, errors) pairs in task order; confirm in demosearch.util.
results = run_mp(_extract_tags, tasks, 'Split OSM other_tags')

# Flatten the per-chunk row lists and error lists into two flat lists.
tags = list(chain(*[e[0] for e in results]))
errors = list(chain(*[e[1] for e in results]))

Split OSM other_tags:   0%|          | 0/200 [00:01<?, ?it/s]

In [8]:
# One row per OSM record: its id plus one column per extracted tag key.
tags_df = pd.DataFrame(tags, columns=['osm_id'] + extract_tags)

# 1/2 the entries, 2.7M are trees and rocks
tags_df = tags_df[~tags_df.natural.isin(['tree', 'rock'])]

# Bring the WKT geometry back in from the raw table (inner join on osm_id).
tags_df = pd.merge(tags_df, df[['osm_id', 'geometry']], on='osm_id')

def encode(v):
    """Return the geohash of a WKT point string such as ``POINT (lon lat)``.

    The coordinates are read from between the parentheses rather than from a
    fixed character offset, so ``POINT(lon lat)`` (no space) parses correctly
    instead of silently losing the first character of the longitude.
    """
    coords = [float(c) for c in v[v.index('(') + 1:v.rindex(')')].split()]
    # WKT order is (lon, lat); the coordinates are reversed before being
    # passed to gh.encode.
    return gh.encode(*coords[::-1])

# The geohash is derived from the WKT text, so compute it before the geometry
# column is replaced by shapely objects.
tags_df['geohash'] = tags_df.geometry.progress_apply(encode)


tags_df['geometry'] = tags_df.geometry.progress_apply(loads_wkt)

tags_df = gpd.GeoDataFrame(tags_df, geometry='geometry', crs=4326)


                                                            

In [13]:
# Preview the extracted tags; missing tag values are shown as 0 for readability
# (display only, tags_df itself is not modified).
tags_df.head().fillna(0)

Unnamed: 0,osm_id,amenity,tourism,shop,leisure,natural,parking,geometry,geohash
0,699684,library,0,0,0,0,0,POINT (-87.95731 42.97439),dp9kqrprynsu
1,14930792,police,0,0,0,0,0,POINT (-80.32974 43.41637),dpwyup9wtm5t
2,14956836,0,viewpoint,0,0,0,0,POINT (-80.30400 43.43688),dpwzhekqjjk7
3,15972546,bank,0,0,0,0,0,POINT (-122.02073 36.97937),9q94r7ghqvxz
4,16878445,0,viewpoint,0,0,0,0,POINT (-64.57743 45.81569),f84hds2pub1y


In [19]:
# The class of a record is its first non-null tag value, scanning the tag
# columns left to right. Back-filling along the row moves that value into the
# first column; this equals the former ffill-then-bfill chain without using the
# deprecated fillna(method=...) form.
tags_df['class'] = tags_df[['amenity', 'tourism', 'shop', 'leisure', 'natural', 'parking']].bfill(
    axis=1).iloc[:, 0]

# Class values to rename before counting.
replace = {'parking': 'parking_space',
           'pub': 'bar',
           }
# Classes that get their own per-geohash count column.
cls = ['restaurant', 'bar', 'cafe', 'fast_food', 'supermarket', 'grave_yard', 'playground',
       'bicycle_parking', 'park', 'fuel', 'bank', 'hotel', 'fitness_centre',
       'laundry', 'clothes', 'convenience', 'parking', 'parking_space']

t = tags_df[['geohash', 'class']].replace(replace)
t = t[t['class'].isin(cls)]

# Count records per (8-character geohash prefix, class) and pivot the classes
# into columns.
cls_df = t.groupby([t.geohash.str.slice(0, 8), 'class']).count().unstack().fillna(0).droplevel(0, axis=1)


# At 8 digits, geohashes are, on average 4m by 20m over the US
# At 6, 146m x 610m
# At 4, 4Km x 20Km
# NOTE(review): these cell sizes look inconsistent with the usual geohash
# cell-size tables -- confirm before relying on them.
# Clip to 10 because it's really unlikely that there are actually more than 10
# amenities in a cell.

# Number of non-null values of each tag key per 8-character geohash cell.
group_counts = tags_df.groupby(tags_df.geohash.str.slice(0, 8))\
    [['amenity', 'tourism', 'shop', 'leisure', 'natural', 'parking']].count().clip(0, 10)

# Outer join keeps cells that appear in only one of the two tables; missing
# counts become 0.
t = group_counts.join(cls_df, how='outer').fillna(0).astype(int)

# Point for each geohash cell, built from the reversed output of gh.decode
# (reversed to give (lon, lat) order for Point, assuming decode returns lat first).
t['geometry'] = [Point(gh.decode(e)[::-1]) for e in t.index]

In [21]:
# Wrap the per-geohash counts as a GeoDataFrame in EPSG:4326, reproject to the
# notebook's `utm_crs`, and turn the geohash index back into a regular column.
# NOTE(review): a single UTM zone is applied to every point; confirm that the
# distortion far outside that zone is acceptable.
geohash_tags = (
    gpd.GeoDataFrame(t, geometry='geometry', crs=4326)
    .to_crs(utm_crs)
    .reset_index()
)

Unnamed: 0,geohash,amenity,tourism,shop,leisure,natural,parking,bank,bar,bicycle_parking,...,fuel,grave_yard,hotel,laundry,park,parking_space,playground,restaurant,supermarket,geometry
0,87vg4y02,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (-4341269.130 3182184.459)
1,87vg4ycq,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (-4340847.522 3182823.917)
2,87vg4z47,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (-4340631.639 3182879.205)
3,87y5cz1h,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (-4295505.165 3187171.396)
4,87y5fkz8,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (-4292212.211 3184033.026)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021411,gwrjd0en,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (1462691.615 10246784.903)
2021412,gwrxu00h,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (1416457.719 10248270.884)
2021413,gwryc8st,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (1434131.372 10257499.194)
2021414,gwrz699z,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,POINT (1423768.002 10255298.600)
