In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from tqdm.notebook import tqdm
tqdm.pandas()
from geoid.censusnames import stusab
import rowgenerators as rg
from geoid.acs import Puma
from pathlib import Path

from demosearch import FileCache


%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()


In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
cache = FileCache(Path(pkg.path).parent.joinpath('data', 'cache'))

pkg

In [3]:
ptm = pkg.reference('puma_tract_map').dataframe()
ptm = ptm.rename(columns={'tract':'geoid','puma':'PUMA'})

In [4]:
%%time

k = 'pums'

if cache.exists(k):
    pums = cache.get(k)
    
else:
    frames = [rg.dataframe(pkg.reference('pums').url.format(st=st), low_memory=False) for st in tqdm(list(stusab.values()))]
    pums = pd.concat(frames)
    pums = pums[['ST','PUMA','HINCP', 'WGTP']]
    pums['PUMA'] = pums.progress_apply(lambda r: str(Puma(r.ST, r.PUMA)), axis=1)
    cache.put(k, pums)

CPU times: user 731 ms, sys: 374 ms, total: 1.1 s
Wall time: 1.11 s


In [5]:
pums.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP
0,1,79500US0101000,,14
1,1,79500US0102701,52450.0,9
2,1,79500US0100400,,11
3,1,79500US0101000,8800.0,15
4,1,79500US0101000,13200.0,29


In [6]:
%%time
cf = {
    'households':'hh',
    'median_income':'mi',
    'agg_by_quintile':'abq',
    'agg_income':'ai',
    'agg_hh_income':'ahhi',
    
}
census = {}


for resource_name, file_name in cf.items():

    if not cache.exists(file_name):
        print("Loading ", resource_name)
        df = pkg.reference(resource_name).dataframe() 
        cache.put(file_name, df)
    else:
        print("Reading ", resource_name)
        df = cache.get(file_name)
        
    census[resource_name] = df

mi = census['median_income']
hh = census['households']
agg = census['agg_income']
agg_hh = census['agg_hh_income']
abq = census['agg_by_quintile']


Reading  households
Reading  median_income
Reading  agg_by_quintile
Reading  agg_income
Reading  agg_hh_income
CPU times: user 112 ms, sys: 55 ms, total: 167 ms
Wall time: 165 ms


In [7]:
col_map = {
 'b11001_001': 'households',
 'b19013_001': 'median_income',
 'b19313_001': 'agg_income',
 'b19025_001': 'agg_hh_income',
 'b19082_001': 'hhinc_00',
 'b19082_002': 'hhinc_20',
 'b19082_003': 'hhinc_40',
 'b19082_004': 'hhinc_60',
 'b19082_005': 'hhinc_80',
 'b19082_006': 'hhinc_95'

}

t = hh.merge(mi, on='geoid').merge(agg, on='geoid').merge(agg_hh, on='geoid').merge(abq, on='geoid')

census = t[ ['geoid']+list(col_map.keys())].rename(columns=col_map)


In [8]:
census.agg_hh_income.describe()

count    7.298900e+04
mean     1.471310e+08
std      1.159368e+08
min      1.517000e+05
25%      7.289280e+07
50%      1.179102e+08
75%      1.868091e+08
max      3.698052e+09
Name: agg_hh_income, dtype: float64

In [9]:
# Convert the percentages of total income that each quantile has to the
# income in that quantile. 
quant_cols = [c for c in census if c.startswith('hhinc')]
census.loc[:,quant_cols] = (census.loc[:,quant_cols]/100).multiply(census.agg_hh_income, axis=0)

In [10]:
# This may not be the correct weighting -- maybe
# The weights are only vild within the PUMA?
samp = pums.dropna()
samp = samp.sample(int(10e6), replace=True, weights=samp.WGTP)

In [11]:
step = 5_000

# Clip so we don't deal with crazy extremes
samp['HINCP'] = samp.HINCP.clip(-step, 500_000)

# Quantize the median incomes of each puma. This becomes the index we will use
# to match tracts to PUMA distributions
samp['medinc'] = samp.groupby('PUMA').HINCP.transform(lambda g: (g.median()/step).round()*step).astype(int)
samp.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP,medinc
22739,22,79500US2200800,125080.0,14,50000
312114,48,79500US4805305,115000.0,107,125000
131835,6,79500US0603735,248400.0,35,60000
199580,37,79500US3703800,28800.0,13,50000
540659,6,79500US0603703,0.0,53,50000


In [12]:
# Build the list of bin boundaries
inc_bins = np.arange(-step, samp.HINCP.max()+step, step)

# Assign the household incomes to bins
samp['inc_bin'] = pd.cut(samp.HINCP, inc_bins).apply(lambda e: e.left)
samp.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP,medinc,inc_bin
22739,22,79500US2200800,125080.0,14,50000,125000.0
312114,48,79500US4805305,115000.0,107,125000,110000.0
131835,6,79500US0603735,248400.0,35,60000,245000.0
199580,37,79500US3703800,28800.0,13,50000,25000.0
540659,6,79500US0603703,0.0,53,50000,-5000.0


In [13]:
# Group by the median income index ( and across pums ) and count up the number of people

medinc_bins = samp.groupby('medinc').inc_bin.value_counts().unstack().fillna(0)
medinc_bins = medinc_bins.divide(samp.groupby('medinc').inc_bin.count(), axis=0)
medinc_bins.columns = list(medinc_bins.columns)
medinc_bins.sort_index(level=['medinc', ]).head(10)

Unnamed: 0_level_0,-5000.0,0.0,5000.0,10000.0,15000.0,20000.0,25000.0,30000.0,35000.0,40000.0,...,450000.0,455000.0,460000.0,465000.0,470000.0,475000.0,480000.0,485000.0,490000.0,495000.0
medinc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15000,0.084526,0.107901,0.153245,0.135692,0.107254,0.083626,0.064949,0.050238,0.043937,0.030041,...,0.0,0.0,0.0,0.0,2.8e-05,0.0,0.0,0.0,0.0,0.000113
20000,0.066604,0.092688,0.12896,0.122561,0.102014,0.084854,0.064566,0.0559,0.049013,0.03742,...,5.7e-05,0.0,0.0,5.7e-05,2.9e-05,0.0,0.0,5.7e-05,0.0,0.000287
25000,0.042021,0.058897,0.113973,0.107751,0.092358,0.079979,0.065634,0.055947,0.05772,0.043568,...,1.6e-05,1.6e-05,0.0,0.0,6.4e-05,3.2e-05,4.8e-05,0.0,3.2e-05,0.000838
30000,0.039073,0.043047,0.097308,0.094402,0.084753,0.073837,0.068206,0.055845,0.052993,0.043284,...,0.00017,2.4e-05,0.000115,0.000146,8.5e-05,0.000188,0.000133,6.7e-05,6.1e-05,0.001566
35000,0.031181,0.036785,0.077223,0.083759,0.077873,0.070339,0.065789,0.057357,0.055412,0.045429,...,8.4e-05,0.000103,0.000122,0.0001,0.000113,0.000138,0.000116,6.3e-05,0.000122,0.001823
40000,0.022601,0.028268,0.058082,0.073968,0.071383,0.068695,0.065911,0.057966,0.058657,0.048201,...,0.000166,0.000168,0.00016,0.000152,0.000145,0.00018,0.000127,0.000132,0.000152,0.002286
45000,0.017705,0.024744,0.046401,0.064101,0.064002,0.063471,0.062138,0.05546,0.0567,0.048841,...,0.000168,0.000169,0.000211,0.000211,0.000184,0.000187,0.000204,0.000168,0.000153,0.003049
50000,0.014842,0.022205,0.039756,0.055125,0.055942,0.057158,0.056743,0.05284,0.054213,0.047492,...,0.000263,0.000221,0.000223,0.000219,0.000184,0.000221,0.000232,0.000172,0.000184,0.003804
55000,0.012977,0.019077,0.032712,0.046893,0.050119,0.051453,0.052465,0.048751,0.051836,0.045768,...,0.000282,0.000225,0.000295,0.000287,0.00023,0.000354,0.000257,0.000191,0.000209,0.004257
60000,0.011958,0.018167,0.02986,0.041762,0.044968,0.046658,0.048505,0.045215,0.048484,0.042842,...,0.000313,0.000355,0.000265,0.000367,0.00032,0.000351,0.000358,0.000314,0.000266,0.00589


In [14]:

mi_max = max(medinc_bins.index)
mi_min = min(medinc_bins.index)

In [15]:
census['medinc'] = (census.median_income/step).clip(mi_min, mi_max).round().fillna(0).astype(int)
census.head()

Unnamed: 0,geoid,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,hhinc_60,hhinc_80,hhinc_95,medinc
0,14000US01001020100,709,60208.0,62231000.0,58707900.0,1702529.1,4091940.63,8471549.97,13491075.42,30956680.0,13256240.0,15000
1,14000US01001020200,688,43958.0,40718100.0,36164700.0,1348943.31,3149945.37,6025039.02,9203916.15,16436860.0,5265580.0,15000
2,14000US01001020300,1360,55345.0,86089300.0,82071900.0,3373155.09,8338505.04,14280510.6,20091201.12,35988530.0,11514690.0,15000
3,14000US01001020400,1675,59663.0,125932600.0,123080000.0,6363236.0,14264972.0,19988192.0,29403812.0,53059790.0,20172810.0,15000
4,14000US01001020500,4483,66108.0,414151800.0,403905900.0,12924988.8,41319573.57,62645805.09,88980469.77,198035100.0,108933400.0,15000


In [16]:
t = census.reset_index().merge(medinc_bins.reset_index(), on='medinc')

t.loc[:,-5000:]= t.loc[:,-5000:].multiply(t.households, axis=0)

tracts = pkg.reference('us_tracts').dataframe()
tracts = tracts[['geoid','tract_id']]
tract_income_dist = tracts.merge(t)#.drop(columns=['medinc','households'])
tract_income_dist = tract_income_dist.rename(columns={ c:str(int(c//1000))+'k' for c in tract_income_dist.loc[:,-5000:].columns })
tract_income_dist.head()

Unnamed: 0,geoid,tract_id,index,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,...,450k,455k,460k,465k,470k,475k,480k,485k,490k,495k
0,14000US01055010700,0,318,1376,46078.0,81490500.0,79543200.0,4422601.92,8948610.0,12901907.04,...,0.0,0.0,0.0,0.0,0.038705,0.0,0.0,0.0,0.0,0.15482
1,14000US01055001300,1,306,1092,22946.0,42149100.0,35495200.0,1508546.0,3247810.8,4976427.04,...,0.0,0.0,0.0,0.0,0.030716,0.0,0.0,0.0,0.0,0.122866
2,14000US01055000900,2,302,898,30238.0,43570000.0,34296100.0,1755960.32,3549646.35,5219866.42,...,0.0,0.0,0.0,0.0,0.025259,0.0,0.0,0.0,0.0,0.101038
3,14000US01055001700,3,308,684,24462.0,29897100.0,25409600.0,1384823.2,2492681.76,3402345.44,...,0.0,0.0,0.0,0.0,0.01924,0.0,0.0,0.0,0.0,0.07696
4,14000US01055010501,4,314,3200,73248.0,294057000.0,277982800.0,11897663.84,29215992.28,46728908.68,...,0.0,0.0,0.0,0.0,0.090012,0.0,0.0,0.0,0.0,0.360046


In [17]:
income_ranges = tract_income_dist[['geoid']].copy()
income_ranges['lt25k'] = tract_income_dist.loc[:,'-5k':'20k'].sum(axis=1)
income_ranges['25k_50k'] = tract_income_dist.loc[:,'25k':'45k'].sum(axis=1)
income_ranges['50k_75k'] = tract_income_dist.loc[:,'50k':'70k'].sum(axis=1)
income_ranges['75k_120k'] = tract_income_dist.loc[:,'75k':'115k'].sum(axis=1)
income_ranges['gt120k'] = tract_income_dist.loc[:,'120k':].sum(axis=1)
income_ranges['gt50k'] = tract_income_dist.loc[:,'50k':].sum(axis=1)
income_ranges['gt60k'] = tract_income_dist.loc[:,'60k':].sum(axis=1)
income_quartiles = income_ranges
income_quartiles.describe()


Unnamed: 0,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
count,72913.0,72913.0,72913.0,72913.0,72913.0,72913.0,72913.0
mean,1124.2113,362.161714,111.908393,51.555947,22.485167,185.949507,126.396743
std,548.555265,176.715636,54.605338,25.156558,10.971564,90.73346,61.674882
min,8.066946,2.598745,0.803015,0.369947,0.161346,1.334308,0.906979
25%,753.587213,242.766139,75.015021,34.559253,15.072375,124.646648,84.726928
50%,1046.014008,336.970662,104.124328,47.969846,20.921156,173.01533,117.604906
75%,1397.598408,450.232652,139.122416,64.093387,27.953138,231.16894,157.134061
max,14257.654946,4593.066046,1419.262777,653.8512,285.165031,2358.279008,1603.009282


In [18]:
income_quartiles.head()

Unnamed: 0,geoid,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
0,14000US01055010700,925.009817,297.98948,92.079098,42.420635,18.50097,153.000703,104.000225
1,14000US01055001300,734.092093,236.485837,73.0744,33.665213,14.682456,121.42207,82.535062
2,14000US01055000900,603.676465,194.472786,60.092318,27.684397,12.074034,99.85075,67.87224
3,14000US01055001700,459.815926,148.128491,45.771877,21.087002,9.196703,76.055582,51.697786
4,14000US01055010501,2151.185621,692.99879,214.137436,98.65264,43.025513,355.815589,241.860988


In [19]:
income_ranges

Unnamed: 0,geoid,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
0,14000US01055010700,925.009817,297.989480,92.079098,42.420635,18.500970,153.000703,104.000225
1,14000US01055001300,734.092093,236.485837,73.074400,33.665213,14.682456,121.422070,82.535062
2,14000US01055000900,603.676465,194.472786,60.092318,27.684397,12.074034,99.850750,67.872240
3,14000US01055001700,459.815926,148.128491,45.771877,21.087002,9.196703,76.055582,51.697786
4,14000US01055010501,2151.185621,692.998790,214.137436,98.652640,43.025513,355.815589,241.860988
...,...,...,...,...,...,...,...,...
72908,14000US72005400900,711.907991,229.339287,70.866108,32.647858,14.238756,117.752721,80.040871
72909,14000US72127008900,751.570476,242.116452,74.814267,34.466766,15.032038,124.313071,84.500183
72910,14000US72127009000,526.368232,169.568142,52.396754,24.139068,10.527805,87.063627,59.180361
72911,14000US72119130702,1037.274816,334.155354,103.254395,47.569070,20.746364,171.569829,116.622345


In [20]:
tract_income_dist

Unnamed: 0,geoid,tract_id,index,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,...,450k,455k,460k,465k,470k,475k,480k,485k,490k,495k
0,14000US01055010700,0,318,1376,46078.0,81490500.0,79543200.0,4422601.92,8948610.00,12901907.04,...,0.0,0.0,0.0,0.0,0.038705,0.0,0.0,0.0,0.0,0.154820
1,14000US01055001300,1,306,1092,22946.0,42149100.0,35495200.0,1508546.00,3247810.80,4976427.04,...,0.0,0.0,0.0,0.0,0.030716,0.0,0.0,0.0,0.0,0.122866
2,14000US01055000900,2,302,898,30238.0,43570000.0,34296100.0,1755960.32,3549646.35,5219866.42,...,0.0,0.0,0.0,0.0,0.025259,0.0,0.0,0.0,0.0,0.101038
3,14000US01055001700,3,308,684,24462.0,29897100.0,25409600.0,1384823.20,2492681.76,3402345.44,...,0.0,0.0,0.0,0.0,0.019240,0.0,0.0,0.0,0.0,0.076960
4,14000US01055010501,4,314,3200,73248.0,294057000.0,277982800.0,11897663.84,29215992.28,46728908.68,...,0.0,0.0,0.0,0.0,0.090012,0.0,0.0,0.0,0.0,0.360046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72908,14000US72005400900,73996,73080,1059,9180.0,15483000.0,14956600.0,10469.62,822613.00,1974271.20,...,0.0,0.0,0.0,0.0,0.029788,0.0,0.0,0.0,0.0,0.119153
72909,14000US72127008900,73997,73825,1118,11151.0,21579200.0,21189400.0,50854.56,1184487.46,2515181.78,...,0.0,0.0,0.0,0.0,0.031448,0.0,0.0,0.0,0.0,0.125791
72910,14000US72127009000,73998,73826,783,12044.0,14271200.0,13088100.0,90307.89,918784.62,1904318.55,...,0.0,0.0,0.0,0.0,0.022025,0.0,0.0,0.0,0.0,0.088099
72911,14000US72119130702,73999,73710,1543,21911.0,54237600.0,48334500.0,1285697.70,4205101.50,6713662.05,...,0.0,0.0,0.0,0.0,0.043402,0.0,0.0,0.0,0.0,0.173610
