In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from tqdm.notebook import tqdm
tqdm.pandas()
from geoid.censusnames import stusab
import rowgenerators as rg
from geoid.acs import Puma

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()


In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [3]:
ptm = pkg.reference('puma_tract_map').dataframe()
ptm = ptm.rename(columns={'tract':'geoid','puma':'PUMA'})

In [4]:
%%time
from pathlib import Path
p = Path('pums.pkl')
if p.exists():
    pums = pd.read_pickle(p)
else:
    frames = [rg.dataframe(pkg.reference('pums').url.format(st=st), low_memory=False) for st in tqdm(list(stusab.values()))]
    pums = pd.concat(frames)
    pums = pums[['ST','PUMA','HINCP', 'WGTP']]
    pums['PUMA'] = pums.progress_apply(lambda r: str(Puma(r.ST, r.PUMA)), axis=1)
    pums.to_pickle('pums.pkl')

CPU times: user 703 ms, sys: 338 ms, total: 1.04 s
Wall time: 1.05 s


In [5]:
pums.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP
0,1,79500US0101000,,14
1,1,79500US0102701,52450.0,9
2,1,79500US0100400,,11
3,1,79500US0101000,8800.0,15
4,1,79500US0101000,13200.0,29


In [6]:
%%time
cf = {
    'households':'hh',
    'median_income':'mi',
    'agg_by_quintile':'abq',
    'agg_income':'ai',
    'agg_hh_income':'ahhi',
    
}
census = {}


for resource_name, file_name in cf.items():

    p = Path(file_name+'.csv')
    if not p.exists():
        print("Loading ", resource_name)
        df = pkg.reference(resource_name).dataframe() 
        df.to_csv(str(p))
    else:
        print("Reading ", resource_name)
        df = pd.read_csv(p, index_col=False)

    census[resource_name] = df

mi = census['median_income']
hh = census['households']
agg = census['agg_income']
agg_hh = census['agg_hh_income']
abq = census['agg_by_quintile']


Reading  households
Reading  median_income
Reading  agg_by_quintile
Reading  agg_income
Reading  agg_hh_income
CPU times: user 580 ms, sys: 90.5 ms, total: 670 ms
Wall time: 677 ms


In [7]:
col_map = {
 'b11001_001': 'households',
 'b19013_001': 'median_income',
 'b19313_001': 'agg_income',
 'b19025_001': 'agg_hh_income',
 'b19082_001': 'hhinc_00',
 'b19082_002': 'hhinc_20',
 'b19082_003': 'hhinc_40',
 'b19082_004': 'hhinc_60',
 'b19082_005': 'hhinc_80',
 'b19082_006': 'hhinc_95'

}

t = hh.merge(mi, on='geoid').merge(agg, on='geoid').merge(agg_hh, on='geoid').merge(abq, on='geoid')

census = t[ ['geoid']+list(col_map.keys())].rename(columns=col_map)


In [8]:
census.agg_hh_income.describe()

count    7.298900e+04
mean     1.471310e+08
std      1.159368e+08
min      1.517000e+05
25%      7.289280e+07
50%      1.179102e+08
75%      1.868091e+08
max      3.698052e+09
Name: agg_hh_income, dtype: float64

In [9]:
# Convert the percentages of total income that each quantile has to the
# income in that quantile. 
quant_cols = [c for c in census if c.startswith('hhinc')]
census.loc[:,quant_cols] = (census.loc[:,quant_cols]/100).multiply(census.agg_hh_income, axis=0)

In [10]:
# This may not be the correct weighting -- maybe
# The weights are only vild within the PUMA?
samp = pums.dropna()
samp = samp.sample(int(10e6), replace=True, weights=samp.WGTP)

In [11]:
step = 5_000

# Clip so we don't deal with crazy extremes
samp['HINCP'] = samp.HINCP.clip(-step, 500_000)

# Quantize the median incomes of each puma. This becomes the index we will use
# to match tracts to PUMA distributions
samp['medinc'] = samp.groupby('PUMA').HINCP.transform(lambda g: (g.median()/step).round()*step).astype(int)
samp.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP,medinc
155749,53,79500US5311615,148800.0,41,100000
178430,26,79500US2601702,60800.0,79,40000
63624,20,79500US2001301,55200.0,45,65000
312251,42,79500US4202402,119000.0,45,65000
35038,16,79500US1600600,97800.0,17,55000


In [12]:
# Build the list of bin boundaries
inc_bins = np.arange(-step, samp.HINCP.max()+step, step)

# Assign the household incomes to bins
samp['inc_bin'] = pd.cut(samp.HINCP, inc_bins).apply(lambda e: e.left)
samp.head()

Unnamed: 0,ST,PUMA,HINCP,WGTP,medinc,inc_bin
155749,53,79500US5311615,148800.0,41,100000,145000.0
178430,26,79500US2601702,60800.0,79,40000,60000.0
63624,20,79500US2001301,55200.0,45,65000,55000.0
312251,42,79500US4202402,119000.0,45,65000,115000.0
35038,16,79500US1600600,97800.0,17,55000,95000.0


In [13]:
# Group by the median income index ( and across pums ) and count up the number of people

medinc_bins = samp.groupby('medinc').inc_bin.value_counts().unstack().fillna(0)
medinc_bins = medinc_bins.divide(samp.groupby('medinc').inc_bin.count(), axis=0)
medinc_bins.columns = list(medinc_bins.columns)
medinc_bins.sort_index(level=['medinc', ]).head(10)

Unnamed: 0_level_0,-5000.0,0.0,5000.0,10000.0,15000.0,20000.0,25000.0,30000.0,35000.0,40000.0,...,450000.0,455000.0,460000.0,465000.0,470000.0,475000.0,480000.0,485000.0,490000.0,495000.0
medinc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15000,0.086783,0.106144,0.153823,0.137921,0.105635,0.081923,0.063631,0.047755,0.042743,0.032362,...,0.0,0.0,0.0,0.0,5.1e-05,2.5e-05,0.0,0.0,0.0,0.000153
20000,0.067045,0.090616,0.127679,0.12063,0.106564,0.087459,0.064175,0.056041,0.050683,0.03735,...,0.0,0.0,0.0,6.4e-05,0.0,0.0,0.0,0.000223,0.0,0.000383
25000,0.045843,0.060721,0.109892,0.111556,0.092616,0.076451,0.066743,0.057709,0.053985,0.042376,...,5.9e-05,4e-05,4e-05,9.9e-05,7.9e-05,0.000139,0.0,0.0,0.0,0.000812
30000,0.037473,0.042094,0.098458,0.093903,0.08591,0.074366,0.068515,0.055271,0.052936,0.043308,...,0.000188,6.6e-05,0.00016,0.000171,5.5e-05,0.000105,0.000121,5e-05,4.4e-05,0.001369
35000,0.030342,0.035674,0.074999,0.084779,0.077028,0.071244,0.06676,0.058176,0.055154,0.046201,...,8e-05,0.00011,0.000146,0.000101,0.000131,0.000146,0.000161,0.000107,0.000125,0.002092
40000,0.02284,0.028632,0.057536,0.074109,0.07063,0.06844,0.064529,0.058052,0.058347,0.048513,...,0.00014,0.000189,0.000115,0.000162,0.000165,0.00015,0.000137,0.000157,0.000171,0.002368
45000,0.018184,0.025404,0.047359,0.063497,0.064148,0.063321,0.061245,0.05516,0.056526,0.048688,...,0.000192,0.000148,0.000234,0.000217,0.000205,0.000209,0.000217,0.000158,0.000153,0.002941
50000,0.014732,0.021874,0.039139,0.054526,0.055932,0.057206,0.057181,0.052622,0.054601,0.047258,...,0.000238,0.000204,0.000202,0.00025,0.000195,0.000221,0.000213,0.000165,0.000185,0.003758
55000,0.012912,0.018973,0.033315,0.047048,0.049997,0.051231,0.052513,0.048182,0.052045,0.045211,...,0.000306,0.000244,0.000278,0.000269,0.000244,0.000314,0.000232,0.00026,0.000254,0.004546
60000,0.011845,0.017818,0.029527,0.04143,0.044681,0.046246,0.048349,0.04446,0.048089,0.042958,...,0.000263,0.000283,0.000315,0.000372,0.000273,0.000375,0.00031,0.000279,0.000279,0.005683


In [14]:

mi_max = max(medinc_bins.index)
mi_min = min(medinc_bins.index)

In [15]:
census['medinc'] = (census.median_income/step).clip(mi_min, mi_max).round().fillna(0).astype(int)
census.head()

Unnamed: 0,geoid,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,hhinc_60,hhinc_80,hhinc_95,medinc
0,14000US01001020100,709,60208.0,62231000.0,58707900.0,1702529.1,4091940.63,8471549.97,13491075.42,30956680.0,13256240.0,15000
1,14000US01001020200,688,43958.0,40718100.0,36164700.0,1348943.31,3149945.37,6025039.02,9203916.15,16436860.0,5265580.0,15000
2,14000US01001020300,1360,55345.0,86089300.0,82071900.0,3373155.09,8338505.04,14280510.6,20091201.12,35988530.0,11514690.0,15000
3,14000US01001020400,1675,59663.0,125932600.0,123080000.0,6363236.0,14264972.0,19988192.0,29403812.0,53059790.0,20172810.0,15000
4,14000US01001020500,4483,66108.0,414151800.0,403905900.0,12924988.8,41319573.57,62645805.09,88980469.77,198035100.0,108933400.0,15000


In [16]:
t = census.reset_index().merge(medinc_bins.reset_index(), on='medinc')

t.loc[:,-5000:]= t.loc[:,-5000:].multiply(t.households, axis=0)

tracts = pkg.reference('us_tracts').dataframe()
tracts = tracts[['geoid','tract_id']]
tract_income_dist = tracts.merge(t)#.drop(columns=['medinc','households'])
tract_income_dist = tract_income_dist.rename(columns={ c:str(int(c//1000))+'k' for c in tract_income_dist.loc[:,-5000:].columns })
tract_income_dist.head()

Unnamed: 0,geoid,tract_id,index,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,...,450k,455k,460k,465k,470k,475k,480k,485k,490k,495k
0,14000US01055010700,0,318,1376,46078.0,81490500.0,79543200.0,4422601.92,8948610.0,12901907.04,...,0.0,0.0,0.0,0.0,0.070017,0.035008,0.0,0.0,0.0,0.21005
1,14000US01055001300,1,306,1092,22946.0,42149100.0,35495200.0,1508546.0,3247810.8,4976427.04,...,0.0,0.0,0.0,0.0,0.055565,0.027783,0.0,0.0,0.0,0.166696
2,14000US01055000900,2,302,898,30238.0,43570000.0,34296100.0,1755960.32,3549646.35,5219866.42,...,0.0,0.0,0.0,0.0,0.045694,0.022847,0.0,0.0,0.0,0.137082
3,14000US01055001700,3,308,684,24462.0,29897100.0,25409600.0,1384823.2,2492681.76,3402345.44,...,0.0,0.0,0.0,0.0,0.034805,0.017402,0.0,0.0,0.0,0.104414
4,14000US01055010501,4,314,3200,73248.0,294057000.0,277982800.0,11897663.84,29215992.28,46728908.68,...,0.0,0.0,0.0,0.0,0.162829,0.081415,0.0,0.0,0.0,0.488487


In [17]:
income_ranges = tract_income_dist[['geoid']].copy()
income_ranges['lt25k'] = tract_income_dist.loc[:,'-5k':'20k'].sum(axis=1)
income_ranges['25k_50k'] = tract_income_dist.loc[:,'25k':'45k'].sum(axis=1)
income_ranges['50k_75k'] = tract_income_dist.loc[:,'50k':'70k'].sum(axis=1)
income_ranges['75k_120k'] = tract_income_dist.loc[:,'75k':'115k'].sum(axis=1)
income_ranges['gt120k'] = tract_income_dist.loc[:,'120k':].sum(axis=1)
income_ranges['gt50k'] = tract_income_dist.loc[:,'50k':].sum(axis=1)
income_ranges['gt60k'] = tract_income_dist.loc[:,'60k':].sum(axis=1)
income_quartiles = income_ranges
income_quartiles.describe()


Unnamed: 0,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
count,72913.0,72913.0,72913.0,72913.0,72913.0,72913.0,72913.0
mean,1124.185362,362.630832,109.091336,52.205565,24.209427,185.506327,127.641968
std,548.542609,176.944541,53.230764,25.473536,11.812911,90.517212,62.282485
min,8.06676,2.602112,0.782801,0.374609,0.173718,1.331128,0.915914
25%,753.569826,243.0806,73.126676,34.994708,16.22819,124.349574,85.561633
50%,1045.989874,337.407149,101.503218,48.574278,22.52548,172.602977,118.763516
75%,1397.566162,450.81585,135.620303,64.90098,30.096705,230.617988,158.682102
max,14257.325989,4599.015571,1383.535835,662.089887,307.032718,2352.65844,1618.801679


In [18]:
income_quartiles.head()

Unnamed: 0,geoid,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
0,14000US01055010700,924.988475,298.375474,89.761201,42.955146,19.919705,152.636051,105.024806
1,14000US01055001300,734.075156,236.792164,71.234907,34.089403,15.80837,121.13268,83.348175
2,14000US01055000900,603.662537,194.724692,58.579621,28.033227,12.999924,99.612772,68.540898
3,14000US01055001700,459.805317,148.320366,44.619667,21.352703,9.901946,75.874316,52.207098
4,14000US01055010501,2151.135988,693.896451,208.746979,99.895688,46.324895,354.967561,244.243735


In [19]:
income_ranges

Unnamed: 0,geoid,lt25k,25k_50k,50k_75k,75k_120k,gt120k,gt50k,gt60k
0,14000US01055010700,924.988475,298.375474,89.761201,42.955146,19.919705,152.636051,105.024806
1,14000US01055001300,734.075156,236.792164,71.234907,34.089403,15.808370,121.132680,83.348175
2,14000US01055000900,603.662537,194.724692,58.579621,28.033227,12.999924,99.612772,68.540898
3,14000US01055001700,459.805317,148.320366,44.619667,21.352703,9.901946,75.874316,52.207098
4,14000US01055010501,2151.135988,693.896451,208.746979,99.895688,46.324895,354.967561,244.243735
...,...,...,...,...,...,...,...,...
72908,14000US72005400900,711.891566,229.636357,69.082203,33.059229,15.330645,117.472077,80.829411
72909,14000US72127008900,751.553136,242.430073,72.930976,34.901056,16.184760,124.016792,85.332655
72910,14000US72127009000,526.356087,169.787788,51.077776,24.443226,11.335123,86.856125,59.763389
72911,14000US72119130702,1037.250884,334.588195,100.655184,48.168452,22.337285,171.160921,117.771276


In [20]:
tract_income_dist

Unnamed: 0,geoid,tract_id,index,households,median_income,agg_income,agg_hh_income,hhinc_00,hhinc_20,hhinc_40,...,450k,455k,460k,465k,470k,475k,480k,485k,490k,495k
0,14000US01055010700,0,318,1376,46078.0,81490500.0,79543200.0,4422601.92,8948610.00,12901907.04,...,0.0,0.0,0.0,0.0,0.070017,0.035008,0.0,0.0,0.0,0.210050
1,14000US01055001300,1,306,1092,22946.0,42149100.0,35495200.0,1508546.00,3247810.80,4976427.04,...,0.0,0.0,0.0,0.0,0.055565,0.027783,0.0,0.0,0.0,0.166696
2,14000US01055000900,2,302,898,30238.0,43570000.0,34296100.0,1755960.32,3549646.35,5219866.42,...,0.0,0.0,0.0,0.0,0.045694,0.022847,0.0,0.0,0.0,0.137082
3,14000US01055001700,3,308,684,24462.0,29897100.0,25409600.0,1384823.20,2492681.76,3402345.44,...,0.0,0.0,0.0,0.0,0.034805,0.017402,0.0,0.0,0.0,0.104414
4,14000US01055010501,4,314,3200,73248.0,294057000.0,277982800.0,11897663.84,29215992.28,46728908.68,...,0.0,0.0,0.0,0.0,0.162829,0.081415,0.0,0.0,0.0,0.488487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72908,14000US72005400900,73996,73080,1059,9180.0,15483000.0,14956600.0,10469.62,822613.00,1974271.20,...,0.0,0.0,0.0,0.0,0.053886,0.026943,0.0,0.0,0.0,0.161659
72909,14000US72127008900,73997,73825,1118,11151.0,21579200.0,21189400.0,50854.56,1184487.46,2515181.78,...,0.0,0.0,0.0,0.0,0.056888,0.028444,0.0,0.0,0.0,0.170665
72910,14000US72127009000,73998,73826,783,12044.0,14271200.0,13088100.0,90307.89,918784.62,1904318.55,...,0.0,0.0,0.0,0.0,0.039842,0.019921,0.0,0.0,0.0,0.119527
72911,14000US72119130702,73999,73710,1543,21911.0,54237600.0,48334500.0,1285697.70,4205101.50,6713662.05,...,0.0,0.0,0.0,0.0,0.078514,0.039257,0.0,0.0,0.0,0.235543
