# Create Percentile Scores

Create the Percentile data, from scores cached in the Score notebook. 


In [9]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

from demosearch import *
from pathlib import Path

cache_dir = '/Volumes/SSD_Extern/radius'
central_cache = FileCache(cache_dir)
rm = RasterManager(central_cache)


In [8]:
pkg = mp.jupyter.open_source_package()
pkg.set_sys_path()
import  pylib 

def get_cache(pkg):
    return FileCache(Path(pkg.path).parent.joinpath('data', 'cache'))

pkg_root = Path(pkg.path).parent

cache = get_cache(pkg)
pkg
 

In [1]:
"/Users/eric/proj/code-projects/radius-search/radius-collection/staging/civicknowledge.com-percentile-demosearch/data"

'/Users/eric/proj/code-projects/radius-search/radius-collection/staging/civicknowledge.com-percentile-demosearch/data'

In [37]:
demo_cols_l = ['total_population', 'male', 'female', 'over25_college',
       'over25_high_school', 'seniors', 'households', 'households_unmaried',
       'housing_owned_college', 'housing_rented_college', 'agg_income',
       'agg_hh_income']


osm_cols_l = ['primary', 'secondary', 'tertiary', 'trunk', 'highway',
       'entertain', 'restaurant', 'casual', 'shop', 'bar', 'cafe', 'active',
       'travel', 'food']
  
# Programatically seperate point counts from demographic variables. Point counts
# are smaller, but more importantly, their std dec is much smaller, with a gap between the two groups from 
# about 800 to 11,000, so 5,000 is a good dividing line. 

stds = df.drop(columns=['naics']).describe().T.sort_values('std')
    
osm_cols = list(stds[stds['std']<5_000].index)
demo_cols = list(stds[stds['std']>5_000].index)
 
# Check that our programatic seperation works. But this will break when we change the columns in the census dataset 
assert set(demo_cols) == set(demo_cols_l)
assert set(osm_cols) == set(osm_cols_l)
              

In [11]:
%%time
frames = [ central_cache.get_df(f) for f in central_cache.list('business_scores/scores/')]
df = pd.concat(frames)

# All CBSA that have less than 50 records are grouped into a single
# category. 
t = df.groupby('cbsa').count().total_population.reset_index()
small_cbsa = t.loc[t.total_population<50].cbsa.to_list()
df['group_cbsa'] = df.cbsa.replace(small_cbsa, '31000US00000')

len(df)

CPU times: user 9.11 s, sys: 490 ms, total: 9.6 s
Wall time: 10.4 s


241144

In [38]:
# Create percentiles dataset

def make_pctile_df(df):
    """Create the percentiles dataset"""

    frames = []
    for idx, g in df.groupby('group_cbsa'):
        t = np.nanpercentile(g[demo_cols+osm_cols], np.linspace(0,100,101), axis=0)
        pct = pd.DataFrame(t, columns = demo_cols+osm_cols).round(2)
        pct['cbsa'] = idx
        pct.index.name = 'pct'
        frames.append(pct.set_index('cbsa',append=True))

    pct = pd.concat(frames) 
    
    return pct
 
def col_pctile(df, cbsa, col, value):
    """Use the CBSA percentiles map to find the percentile of a column value """
    t = df.loc[(slice(None), cbsa),:]
    return (t[col]-value).abs().argmin()
    
pct = make_pctile_df(df)
    
col_pctile(pct, '31000US31080', 'cafe', 21.0)
    

49

In [48]:
# Check that a sub-sample returns the same results. Can we use fewer records?
for (pct_v, cbsa), row in list(pct.sample(20).iterrows()):
    target_value = row['cafe']
    
    pct_t = make_pctile_df(df.sample(200_000))
   
    v = col_pctile(pct, cbsa, 'cafe', target_value)
    
    print(f'{row.name} target={target_value} target_pct={pct_v} calc_pct={v}')
    
    

(13, '31000US41180') target=0.0 target_pct=13 calc_pct=0
(80, '31000US14380') target=3.41 target_pct=80 calc_pct=76
(75, '31000US12700') target=3.13 target_pct=75 calc_pct=74
(43, '31000US15940') target=0.38 target_pct=43 calc_pct=42
(57, '31000US34980') target=4.41 target_pct=57 calc_pct=57
(2, '31000US45460') target=0.0 target_pct=2 calc_pct=0
(61, '31000US47460') target=6.7 target_pct=61 calc_pct=61
(94, '31000US20500') target=16.27 target_pct=94 calc_pct=94
(93, '31000US42540') target=10.16 target_pct=93 calc_pct=93
(79, '31000US26580') target=2.05 target_pct=79 calc_pct=79
(86, '31000US35620') target=275.45 target_pct=86 calc_pct=86
(56, '31000US21340') target=0.0 target_pct=56 calc_pct=0
(12, '31000US43740') target=0.0 target_pct=12 calc_pct=0
(49, '31000US31300') target=0.0 target_pct=49 calc_pct=0
(55, '31000US46060') target=7.16 target_pct=55 calc_pct=55
(47, '31000US31860') target=0.0 target_pct=47 calc_pct=0
(91, '31000US33260') target=0.57 target_pct=91 calc_pct=56
(82, '31

In [56]:
df[df.group_cbsa == '31000US31300']

Unnamed: 0,total_population,male,female,over25_college,over25_high_school,seniors,households,households_unmaried,housing_owned_college,housing_rented_college,...,bar,cafe,active,travel,food,cbsa,naics,group,geometry,group_cbsa
93,34683.996865,16319.905059,18364.091805,4328.846567,13939.201835,5337.134524,12230.531082,811.988477,1878.603210,765.092662,...,0.0,0.0,0.487739,79.150522,1.991600,31000US31300,452111,shop,POINT (-79.00796 34.63693),31000US31300
28,33288.802457,15579.351587,17709.450870,3999.993385,13205.933048,4993.869635,11641.818049,790.058875,1741.434027,701.639021,...,0.0,0.0,0.569029,71.967213,2.389920,31000US31300,722511,ent,POINT (-79.00035 34.64101),31000US31300
22,24968.527968,11757.804718,13210.723250,3168.502236,9838.630301,3863.292596,8571.636054,572.980125,1433.319524,437.691607,...,0.0,0.0,0.853543,85.717382,1.422571,31000US31300,447190,auto,POINT (-79.00154 34.66540),31000US31300
57,34753.577828,16262.241821,18491.336008,4134.905987,13811.812756,5233.908522,12201.651602,823.689574,1781.366484,762.146382,...,0.0,0.0,0.487739,72.486113,2.389920,31000US31300,441310,auto,POINT (-78.99764 34.63651),31000US31300
95,18137.180545,8557.456673,9579.723872,1914.100618,5781.185986,2199.014198,5702.765807,280.495307,890.863914,293.412083,...,0.0,0.0,0.426771,48.072077,3.414172,31000US31300,722511,ent,POINT (-79.18540 34.67416),31000US31300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,34913.811533,16240.112596,18673.698938,4042.842386,13820.418706,5243.700530,12248.634327,837.409512,1721.642361,754.731336,...,0.0,0.0,0.487739,70.995800,4.097006,31000US31300,722511,ent,POINT (-78.99080 34.63359),31000US31300
73,31852.754594,15039.045559,16813.709035,3909.121146,12843.409518,4865.142207,11137.330346,752.150701,1716.667434,642.607643,...,0.0,0.0,0.487739,87.971820,1.991600,31000US31300,722511,ent,POINT (-79.01714 34.64091),31000US31300
7,35123.308674,16414.428180,18708.880494,4144.915926,13951.931044,5293.293985,12339.969831,833.947649,1776.651988,774.808086,...,0.0,0.0,0.487739,72.169083,2.389920,31000US31300,441110,auto,POINT (-78.99572 34.63498),31000US31300
94,30999.856896,14752.303523,16247.553373,3688.828158,12566.852059,4690.226809,10789.852826,723.612828,1614.069856,618.164960,...,0.0,0.0,0.426771,80.566319,1.991600,31000US31300,722511,ent,POINT (-79.02368 34.63429),31000US31300
