In [1]:

import dask
import dask.dataframe as dd
import geopandas as gpd
import libgeohash as gh
import matplotlib.pyplot as plt
import metapack as mp
import numpy as np
import pandas as pd
import rowgenerators as rg
import seaborn as sns
import utm
from IPython.display import display 
from demosearch import FileCache
from demosearch.util import munge_pbar,  run_mp, gh_path, disaggregate
from geoid.censusnames import stusab
from itertools import chain 
import shapely
from shapely.geometry import LineString, Point, Polygon
from shapely.wkt import loads as loads_wkt
from itertools import chain 

from tqdm.notebook import tqdm
# Hook tqdm into pandas (per the tqdm docs this adds the progress_* methods,
# e.g. `progress_apply`).
tqdm.pandas()

%matplotlib inline
# Seaborn plot scaling for notebook display, then metapack's Jupyter setup.
sns.set_context('notebook')
mp.jupyter.init()

# EPSG code for the UTM CRS. Presumably NAD83 / UTM zone 11N — TODO confirm.
# NOTE(review): not referenced by any of the cells visible here.
utm_crs = 26911


In [2]:
# Open the metapack package through the Jupyter helper. The source-package
# variant is the one in use; the open_package() call is kept, commented out,
# as the alternative.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression so the notebook displays the package.
pkg

In [4]:

# Load the 'us_geohashes' package reference as a geoframe. It is the second
# operand of gpd.overlay() in the per-chunk worker further down.
# NOTE(review): presumably the source of the geohash / utm_epsg / utm_area
# columns selected after that overlay — confirm against the reference schema.
hashes = pkg.reference('us_geohashes').geoframe()

In [7]:
%%time

from pathlib import Path

# Source CSV for the 'lines' reference, and the directory that holds its
# feather chunks.
fp = pkg.reference('lines').resolved_url.fspath
op = Path.cwd().parent.joinpath('data','cache', 'lines')
op.mkdir(parents=True, exist_ok=True)

#
# Write out the lines file in chunks so the chunks can be processed in
# multiple processes.
#
# NOTE: chunk files are named by position only ("<i>.feather"), so files
# already on disk are only valid for the chunksize they were written with.
# Clear the cache directory before changing chunksize.
chunksize = 10000
n_rows = 53065618  # presumably the row count of the lines CSV; only sizes the progress bar
total = -(-n_rows // chunksize)  # ceiling division, so a final partial chunk is counted
with pd.read_csv(fp, chunksize=chunksize, low_memory=False) as reader:
    for i, df in tqdm(enumerate(reader), total=total):
        p = op.joinpath(f"{i}.feather")
        # Skip chunks that were already written. The test must be on the chunk
        # file itself: the directory always exists at this point, so testing
        # the directory would skip every write.
        if not p.exists():
            df.reset_index().to_feather(p)

  0%|          | 0/5306 [00:00<?, ?it/s]

CPU times: user 2min 53s, sys: 31 s, total: 3min 24s
Wall time: 3min 24s


In [66]:
# Map the `highway` values to keep onto one-character codes. The per-chunk
# worker drops every row whose `highway` is not a key here, so keys must match
# the source values exactly (no stray whitespace).
hw_type = {
    'residential': 'r',
    'primary': '1',
    'secondary': '2',
    'tertiary': '3',
    'motorway': 'm',
    'motorway_link': 'l',
    'trunk': 't',
}
    

In [8]:
%%time

# Process each of the separate files, then
# write them back out for later recombination

# File cache rooted at ../data/cache, plus one single-argument task per
# feather chunk found under the chunk directory.
cache = FileCache(Path.cwd().parent / 'data' / 'cache')
tasks = [[chunk_path] for chunk_path in op.glob("**/*.feather")]
    
def _f(fn):
    """Filter one chunk to the wanted road types and overlay it on the geohashes.

    Reads the feather chunk at `fn`, keeps the rows whose `highway` value is a
    key of `hw_type`, recodes `highway` to its one-character code, parses the
    WKT `geometry` column, overlays the result on `hashes`, and stores the
    selected columns in `cache`.

    Args:
        fn: Path of a feather chunk file; its stem names the cache entry.

    Returns:
        The cache key (``recombine/<stem>``) the frame was stored under.
    """
    chunk = pd.read_feather(fn)

    # Take an explicit copy of the filtered rows: the column assignments below
    # would otherwise target a slice of `chunk` and raise SettingWithCopyWarning.
    roads = chunk[chunk.highway.isin(list(hw_type.keys()))].copy()
    roads['highway'] = roads.highway.replace(hw_type) # Cuts file size by 100M
    roads['geometry'] = roads.geometry.apply(shapely.wkt.loads)

    # The WKT is treated as EPSG:4326 before being overlaid on the geohash
    # polygons.
    gdf = gpd.GeoDataFrame(roads, crs=4326)
    split = gpd.overlay(gdf, hashes)

    split = split[['osm_id','geohash','utm_epsg','utm_area','highway','geometry']]

    key = f"recombine/{fn.stem}"

    cache.put_df(key, split)

    return key
  
# Run _f over every chunk task via run_mp (presumably in parallel worker
# processes — confirm in demosearch.util); the result is consumed by the next
# stage as an iterable of cache keys.
recombine_keys = run_mp(_f, tasks, desc='Split By Geohash')

Split By Geohash:   0%|          | 0/531 [00:00<?, ?it/s]

CPU times: user 722 ms, sys: 230 ms, total: 953 ms
Wall time: 13min 55s


In [9]:
%%time 

def _f(key):
    """Reproject one cached frame per `utm_epsg` group and store each group.

    Loads the frame stored under `key`, groups it by `utm_epsg`, and for each
    group converts the geometry to that EPSG, simplifies it, and serializes it
    to WKT with `rounding_precision=0`. Each group is written to `cache` under
    ``epsg/<utm_epsg>/<name>``.

    Args:
        key: Cache key of the form ``<prefix>/<name>``.

    Returns:
        List of cache keys that were written. A group that raises
        AttributeError is reported with a printed message and left out.
    """
    df = cache.get_df(key)

    # The output name is the same for every group, so derive it once.
    _, fn = key.split('/')

    okeys = []
    for epsg, grp in df.groupby('utm_epsg'):
        okey = f'epsg/{epsg}/{fn}'

        try:
            # simplify(20, False): tolerance 20 in the target CRS units
            # (presumably meters for UTM — confirm); the second argument is
            # shapely's preserve_topology flag.
            geometry = (
                grp.to_crs(epsg=epsg).geometry
                .simplify(20, False)
                .apply(lambda e: shapely.wkt.dumps(e, rounding_precision=0))
            )

            # Store a plain DataFrame whose geometry column holds WKT text.
            out = pd.DataFrame(grp).assign(geometry=geometry)

            cache.put_df(okey, out)
            okeys.append(okey)
        except AttributeError:
            print(f'!!! Error converting {epsg} in {key}')
    return okeys

# Reproject every recombined frame, one single-argument task per cache key.
# Each task returns a list of keys, so epsg_keys is a list of lists.
epsg_keys = run_mp(_f, [(e,) for e in recombine_keys], desc='Split By EPSG')
 

Split By Geohash:   0%|          | 0/531 [00:00<?, ?it/s]

CPU times: user 408 ms, sys: 134 ms, total: 542 ms
Wall time: 4min 41s


In [20]:
%%time
# Flatten the per-task key lists, load every cached frame, and stack them.
ek = list(chain.from_iterable(epsg_keys))
frames = [cache.get_df(cache_key) for cache_key in tqdm(ek)]
t = pd.concat(frames)

# Partition on the one-character highway code: 'r' (residential) vs. the rest.
residential = t.loc[t.highway == 'r']
nonres = t.loc[t.highway != 'r']

  0%|          | 0/4856 [00:00<?, ?it/s]

CPU times: user 5.87 s, sys: 1.44 s, total: 7.31 s
Wall time: 10.4 s


In [None]:

# Write the residential and non-residential subsets to CSV under ../data,
# timing each write.
%time residential.to_csv('../data/residential.csv')
%time nonres.to_csv('../data/nonres.csv')