In [1]:

import seaborn as sns
import metapack as mp
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from demosearch import FileCache
from pathlib import Path

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

In [2]:
# Open the package through metapack's Jupyter helper, using the
# source-package opener.
pkg = mp.jupyter.open_source_package()

# The cache lives under data/cache in the directory that holds pkg.path.
pkg_root = Path(pkg.path).parent
cache = FileCache(pkg_root / 'data/cache')

pkg

In [3]:
# Display the 'naics_index_2007_26' reference as a DataFrame.
# NOTE(review): the saved output of this cell is an AssertionError
# ("SHould not get here"), preceded by "No module named 'lib'" and
# "No module named 'pylib.clusters'" -- confirm the reference loads on a
# fresh kernel before relying on the cells below.
pkg.reference('naics_index_2007_26').dataframe()

XXXX lib
No module named 'lib'
XXXX pylib
No module named 'pylib.clusters'


AssertionError: SHould not get here

In [None]:
# Load the NAICS index and give its six columns short working names.
naics = pkg.reference('naics_index_2007_26').dataframe()
naics.columns = 'seq naics_code naics_desc a b c'.split()
t = naics

def int_maybe(v):
    """Return ``int(v)`` when the conversion succeeds, otherwise ``v`` unchanged.

    Used to normalise NAICS codes so that numeric codes compare equal whether
    they arrive as ints or as digit strings, while non-numeric values pass
    through untouched.

    Args:
        v: Any value.

    Returns:
        ``int(v)`` if ``int()`` accepts the value; otherwise the original
        object ``v``.
    """
    try:
        return int(v)
    except (ValueError, TypeError, OverflowError):
        # Only the errors int() raises for unconvertible input are handled;
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return v

# Lookup table: NAICS code (as an int where int_maybe can convert it,
# otherwise the raw value) -> description.
naics_map = dict(
    (int_maybe(row.naics_code), row.naics_desc)
    for row_label, row in naics.iterrows()
)

In [None]:
# Load three package references as geoframes, in this order:
# county boundary, business sites, business clusters.
sd_county, bs, bc = (
    pkg.reference(ref_name).geoframe()
    for ref_name in ('sd_county_boundary', 'business_sites', 'business_clusters')
)

In [None]:
# The business listing comes as two references; load both and stack them
# into a single frame.
sdb_ak = pkg.reference('sd_businesses_ak').dataframe()
sdb_lz = pkg.reference('sd_businesses_lz').dataframe()

# sb_mbl is the name in the metadata; both names refer to the same frame.
sb_mbl = sdb = pd.concat([sdb_ak, sdb_lz])
sdb.head()

In [None]:
# One row per business: the account number, the raw NAICS code, and the code
# (as text) truncated to its first 2, 3, 4, 5 and 6 characters.
naics_text = sb_mbl['NAICS'].astype(str)

nl = pd.DataFrame({
    'account': sb_mbl['BUSINESS ACCT#'],
    'naics': sb_mbl['NAICS'],
    **{f'naics_{width}': naics_text.str.slice(0, width) for width in range(2, 7)},
})

# For every column except 'account', add a "<column>_desc" column holding the
# naics_map entry for that value (None where there is no entry).
for c in nl.columns[1:]:
    dc = c + "_desc"
    nl[dc] = nl[c].apply(lambda code: naics_map.get(int_maybe(code)))

# From here on `naics` names this per-business table, not the NAICS index.
naics = nl

In [None]:
# Spot-check the lookup table: show the description stored for code 72241
# (None if the code is not in naics_map).
naics_map.get(72241)

In [None]:
# Spatially join the business clusters to the county boundary (sjoin with its
# default arguments), then keep only the cluster number and geometry.
# NOTE(review): assumes bc and sd_county share a CRS -- confirm.
t  = gpd.sjoin(bc, sd_county)
sd_business_clusters = t[['cluster_n','geometry']]

In [None]:
#http://barker:4000/v1/search?text=1370%20Wilbur,%20San%20Diego,%20CA
import requests
import urllib.parse
import hashlib 
from demosearch.util import run_mp
def geocode(cache, text, timeout=60):
    """Geocode a free-text address, memoising the JSON response in ``cache``.

    The cache key is derived from the SHA-224 hex digest of the URL-encoded
    text, prefixed by the digest's first two character pairs:
    ``geocode/<h[0:2]>/<h[2:4]>/<h>``.

    Args:
        cache: Object providing ``exists(key)``, ``get(key)`` and
            ``put(key, value)``.
        text: The address string to look up.
        timeout: Seconds to wait for the geocoding service before giving up.
            Without a timeout ``requests`` waits indefinitely, which would
            stall a worker forever on a hung connection.

    Returns:
        The decoded JSON response, taken from the cache when present.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    text_enc = urllib.parse.quote_plus(text)
    text_h = hashlib.sha224(text_enc.encode('utf8')).hexdigest()

    key = f"geocode/{text_h[0:2]}/{text_h[2:4]}/{text_h}"

    if cache.exists(key):
        return cache.get(key)

    r = requests.get('http://barker:4000/v1/search?text=' + text_enc, timeout=timeout)
    r.raise_for_status()
    j = r.json()

    # Only successful responses reach this point, so errors are never cached.
    cache.put(key, j)
    return j
    
def geocode_chunk(cache, frame_key):
    """Geocode every row of a cached DataFrame chunk.

    Each row's ADDRESS, CITY, ZIP and STATE fields are combined into one
    address string and passed to ``geocode``. A lookup is attempted up to
    three times; only errors whose ``response`` has status code 400 are
    retried, with a doubling sleep plus random jitter between attempts.

    Args:
        cache: Cache object; ``cache.get(frame_key)`` must return the chunk's
            DataFrame, and the same object is passed on to ``geocode``.
        frame_key: Cache key under which the chunk was stored.

    Returns:
        A list with one ``(account, address, result)`` tuple per row, where
        ``account`` is the row's 'BUSINESS ACCT#' and ``result`` is:

        * the geometry of the first returned feature on success,
        * ``None`` when the service returned no features or all retries were
          used up,
        * the exception instance for any other failure.

        Callers must check ``result`` before indexing into it.
    """
    # Imported here so the function is self-contained when run in a worker.
    from time import sleep
    from random import uniform

    df = cache.get(frame_key)
    results = []
    for idx, row in df.iterrows():
        addr = f"{row.ADDRESS} {row.CITY} {row.ZIP}, {row.STATE}"

        # Reset per row: otherwise a failed lookup would silently reuse the
        # previous row's geometry (or raise NameError on the first row).
        r = None
        sleep_time = 1
        for i in range(3):

            try:
                j = geocode(cache, addr)
                r = j['features'][0]['geometry']
                break  # success: no need to repeat the lookup
            except IndexError:
                # Empty feature list: the address has no match, so retrying
                # cannot help.
                break
            except Exception as e:
                # Not every exception carries a `.response` attribute.
                response = getattr(e, 'response', None)
                if response is not None and response.status_code == 400:
                    # The service's 400 answers are treated as transient.
                    print("Timeout: Iter",i,' sleep', sleep_time, e)
                    sleep(sleep_time+uniform(0,3))
                    sleep_time *= 2
                else:
                    r = e
                    break

        results.append( (row['BUSINESS ACCT#'], addr, r))

    return results

# Split the business list into roughly 100 chunks. The floor of 1 keeps
# range() from receiving a zero step when there are fewer than 100 rows.
chunk_size = max(1, len(sdb) // 100)

# Store each chunk in the cache and hand the workers only (cache, key), so
# each worker loads its own chunk by key.
frames = [sdb[i:i+chunk_size] for i in range(0, sdb.shape[0], chunk_size)]
tasks = []
for i, f in enumerate(frames):
    key = f'chunks/{i}'

    cache.put_df(key, f)
    tasks.append((cache, key))

print("Start MP run")
r = run_mp(geocode_chunk, tasks, n_cpu=4)

from itertools import chain
import geopandas as gpd

# Flatten the per-chunk result lists into (account, address, result) tuples.
geocoded = list(chain.from_iterable(r))

# Keep only results that carry coordinates; failed lookups are not
# subscriptable geometries and would crash the row builder.
# Index 1 is used as latitude and index 0 as longitude.
rows = [
    (account, addr, geom['coordinates'][1], geom['coordinates'][0])
    for account, addr, geom in geocoded
    if isinstance(geom, dict) and 'coordinates' in geom
]
print(f"Geocoded {len(rows)} of {len(geocoded)} addresses")

gcodes = pd.DataFrame(rows, columns='account gc_address lat lon'.split())
gcodes['geometry'] = gpd.points_from_xy(gcodes.lon, gcodes.lat)
gcodes = gpd.GeoDataFrame(gcodes, crs=4326)

# Attach the business attributes, then spatially join to the county boundary
# (sjoin defaults).
t = gcodes.merge(sdb, left_on='account', right_on='BUSINESS ACCT#')
t = gpd.sjoin(t, sd_county)

In [None]:
# Keep the geocoding columns (account, gc_address, lat, lon), a subset of the
# business-listing columns, and the geometry. Selecting explicit columns also
# leaves behind whatever else the preceding merge and sjoin added to `t`.
cols = ['account', 'gc_address', 'lat', 'lon',  'DBA NAME', 'OWNERSHIP TYPE',
         'CREATION DT', 'START DT', 'EXP DT', 'OWNER NAME','NAICS', 'ACTIVITY DESC', 'geometry']
sd_businesses = t[cols]

In [None]:
# Load the 'census_bg' geometries and the 'total_population' table.
blocks = pkg.reference('census_bg').geoframe()
pop = pkg.reference('total_population').dataframe()

# merge() is given no keys, so pandas joins on the columns the two frames
# have in common.
t = blocks.merge(pop.reset_index())

# Keep six columns, rename b01003_001 -> pop and aland -> area, and
# reproject to CRS 4326.
cols = 'geoid aland b01003_001 intptlat intptlon geometry'.split()
new_names = {'b01003_001': 'pop', "aland": 'area'}
blocks = t[cols].rename(columns=new_names).to_crs(4326)


In [None]:
# Merge blockgroup population into sd_businesses
from geoid.censusnames import stusab
import rowgenerators as rg


# Columns brought over from `blocks` by the spatial join.
block_cols = ['geoid', 'pop', 'area']

# This cell overwrites sd_businesses, so on a re-run the frame already holds
# the block columns; joining again would produce suffixed duplicates and the
# selection below would raise KeyError. Dropping them first (a no-op on the
# first run) makes the cell safe to re-execute.
businesses_in = sd_businesses.drop(columns=block_cols, errors='ignore')

# Spatial join (sjoin defaults) of each business to the block containing it,
# keeping the original business columns plus the three block columns.
t = gpd.sjoin(businesses_in, blocks)
sd_businesses = t[list(businesses_in.columns) + block_cols]

sd_businesses.head()

In [None]:
# Left spatial join keeps every business; those that match no cluster get
# cluster_n 0 in place of the missing value.
t = gpd.sjoin(sd_businesses, sd_business_clusters, how='left')
t['cluster_n'] = t['cluster_n'].fillna(0)

# Output column order; anything else the join added is left out.
cols = [
    'account', 'gc_address', 'lat', 'lon', 'geoid', 'pop', 'area',
    'DBA NAME', 'OWNERSHIP TYPE', 'CREATION DT', 'START DT', 'EXP DT',
    'OWNER NAME', 'NAICS', 'ACTIVITY DESC',
    'cluster_n', 'geometry',
]
sd_custered_businesses = t[cols]


In [None]:
# Load the 'bid' reference, reproject it to CRS 4326, and keep only the
# NAME2 and SYMBOL columns (renamed bid_name / bid_code) plus the geometry.
bid_raw = pkg.reference('bid').geoframe()
bid_renames = {'NAME2': 'bid_name', 'SYMBOL': 'bid_code'}
bid = bid_raw.to_crs(4326)[['NAME2', 'SYMBOL', 'geometry']].rename(columns=bid_renames)

# Map of the areas, coloured by name.
bid.plot(column='bid_name')


In [None]:
# This cell overwrites sd_custered_businesses, so on a re-run the frame
# already carries bid's attribute columns and the join would silently emit
# *_left / *_right duplicates. Dropping them first (a no-op on the first run)
# makes the cell safe to re-execute.
bid_attr_cols = [c for c in bid.columns if c != 'geometry']
clustered_in = sd_custered_businesses.drop(columns=bid_attr_cols, errors='ignore')

# Left spatial join keeps every business; bid columns are missing where a
# business matches no bid geometry.
t = gpd.sjoin(clustered_in, bid, how='left').drop(columns=['index_right'])

# Move the geometry column to the end.
sd_custered_businesses = t[[c for c in t.columns if c != 'geometry'] + ['geometry']]

In [None]:
# Preview the first rows, transposed so each column is shown as a row.
sd_custered_businesses.head().T

In [None]:
# Shell escape: print the working directory the kernel's shell commands run in.
!pwd
