In [2]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from tqdm.notebook import tqdm
tqdm.pandas()
from geoid.censusnames import stusab
import rowgenerators as rg

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for any figures drawn below.
sns.set_context('notebook')
# NOTE(review): presumably sets up metapack's Jupyter integration — confirm against the metapack docs.
mp.jupyter.init()


In [3]:
# Open the metapack package; every `pkg.reference(...)` below resolves against it.
# The commented-out line is the alternative opener, kept for switching by hand.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [4]:
# PUMA/tract crosswalk, with its columns renamed so tracts are keyed by
# 'geoid' and PUMAs by 'PUMA' (the names the later merges and joins use).
ptm = (
    pkg.reference('puma_tract_map')
    .dataframe()
    .rename(columns={'tract': 'geoid', 'puma': 'PUMA'})
)

In [5]:
# Load one PUMS frame per state abbreviation in `stusab` and stack them.

# The reference URL is a template with an `{st}` placeholder; look it up once
# instead of once per state.
pums_url = pkg.reference('pums').url

frames = [
    rg.dataframe(pums_url.format(st=st), low_memory=False)
    for st in tqdm(list(stusab.values()))
]

# ignore_index=True gives the combined frame one fresh 0..n-1 row index.
# Without it the per-state row labels are carried over as-is and (most likely)
# repeat across states, which makes later label-based alignment ambiguous.
pums = pd.concat(frames, ignore_index=True)

  0%|          | 0/52 [00:00<?, ?it/s]

In [7]:
# Independent copy of just the state, PUMA, household-income and weight columns.
pumx = pums.loc[:, ['ST', 'PUMA', 'HINCP', 'WGTP']].copy()

In [8]:
# Bytes used by the index and by each column of the slimmed-down frame
# (shallow measurement: object contents are not inspected).
pumx.memory_usage(index=True, deep=False)

Unnamed: 0,ST,PUMA,HINCP,WGTP
0,1,1000,,14
1,1,2701,52450.0,9
2,1,400,,11
3,1,1000,8800.0,15
4,1,1000,13200.0,29


In [None]:
from geoid.acs import Puma

# Replace the numeric PUMA code with the string form of its Puma geoid,
# built from the state code and the PUMA code.
#
# A geoid depends only on the (ST, PUMA) pair, so build it once per distinct
# pair and look the result up for each row. The previous row-wise
# `progress_apply(axis=1)` constructed a Series and a Puma object for every
# one of the ~7.7M rows.
puma_pairs = pums[['ST', 'PUMA']].drop_duplicates()
puma_geoids = {
    (st, puma): str(Puma(st, puma))
    for st, puma in puma_pairs.itertuples(index=False, name=None)
}

# NOTE(review): like the original, this overwrites `pums['PUMA']`, so the cell
# must not be re-run without reloading `pums` (the codes are already strings).
pums['PUMA'] = [puma_geoids[pair] for pair in zip(pums['ST'], pums['PUMA'])]

  0%|          | 0/7691157 [00:00<?, ?it/s]

In [None]:
%time hh = pkg.reference('households').dataframe().rename(columns={'b11001_001':'households'})
hh = hh[['households']].copy()

In [None]:
%time mi = pkg.reference('median_income').dataframe().rename(columns={'b19013_001':'median_income'})
mi = mi[['median_income']].copy()
mi

In [None]:
# Draw a large with-replacement sample of PUMS records, with each record's
# chance of selection proportional to its WGTP weight. Rows with a missing
# PUMA, HINCP or WGTP are dropped first.
N_SAMPLES = 10_000_000  # same size as the original int(10e6)
SAMPLE_SEED = 0         # fixed seed so a Restart & Run All reproduces the same draw

t = pums[['PUMA', 'HINCP', 'WGTP']].dropna()
t = t.sample(N_SAMPLES, replace=True, weights=t.WGTP, random_state=SAMPLE_SEED)

In [None]:
# Width of one income bin, in the same units as HINCP.
step = 5_000

# Bound household income to [-step, 500_000] so extreme values do not
# stretch the binning.
t['HINCP'] = t['HINCP'].clip(lower=-step, upper=500_000)

# Median household income of each PUMA, rounded to the nearest multiple of
# `step`. This quantized value is the key used to match tracts to PUMA
# income distributions.
puma_median = t.groupby('PUMA')['HINCP'].transform('median')
t['medinc'] = ((puma_median / step).round() * step).astype(int)
t.head()

In [None]:
# Bin edges from -step up to (and including) the largest income, `step` apart.
inc_bins = np.arange(-step, t.HINCP.max() + step, step)

# Label every household income with the left edge of its bin.
# - labels=inc_bins[:-1] names each bin by its left edge directly, replacing
#   the earlier per-interval `.apply(lambda e: e.left)`.
# - include_lowest=True closes the first bin on the left. pd.cut's intervals
#   are otherwise left-open, so incomes equal to the lowest edge (-step, which
#   is exactly where the lower clip puts them) fell outside every bin and
#   became NaN.
t['inc_bin'] = pd.cut(t.HINCP, inc_bins, labels=inc_bins[:-1], include_lowest=True)
t.head()

In [None]:
# For every (PUMA, medinc) group, count the sampled household records that
# fall in each income bin, then divide by the group's total so each row holds
# the share of records per bin.
by_puma_medinc = t.groupby(['PUMA', 'medinc'])['inc_bin']

bin_counts = by_puma_medinc.value_counts().unstack().fillna(0)
medinc_bins = bin_counts.divide(by_puma_medinc.count(), axis=0)

# Re-assign the column labels from a plain list of the existing labels.
medinc_bins.columns = list(medinc_bins.columns)

medinc_bins.sort_index(level=['medinc']).head(10)

In [None]:
# Smallest and largest quantized median income present in `medinc_bins`
# (second level of its index); used to bound the tract-level values.
medinc_levels = list(medinc_bins.index.get_level_values(1))
mi_min, mi_max = min(medinc_levels), max(medinc_levels)

In [None]:
# Quantize each tract's median income the same way the PUMA medians were
# quantized: round to the nearest multiple of `step`, expressed in dollars.
#
# The earlier version divided by `step` but never multiplied back, so it
# compared values in units of `step` (e.g. 12) against `mi_min`/`mi_max`,
# which are in dollars. The clip then pushed every tract up to `mi_min`.
mi['medinc'] = (
    ((mi.median_income / step).round() * step)
    .clip(mi_min, mi_max)  # keep within the range of PUMA medians that exist
    .fillna(0)             # tracts with no median income get 0
    .astype(int)
)

# Attach each tract's PUMA and household count, then move the index into a column.
t = mi.join(ptm.set_index('geoid')).join(hh).reset_index()

# NOTE(review): the merge against `medinc_bins` is still disabled here. The
# later cells that slice income-bin columns by label need those columns, so
# confirm whether it should be re-enabled:
#   t.merge(medinc_bins.reset_index(), on=['PUMA','medinc'])
t

In [None]:
# Show `medinc_bins` with its (PUMA, medinc) index levels moved into ordinary
# columns, which is the shape needed to merge on those two keys.
medinc_bins.reset_index(drop=False)

In [None]:
# Scale the per-bin shares by each row's household count to get household
# counts per income bin.
#
# Income-bin columns are the only ones with numeric labels (the bin's left
# edge); every other column has a string name. Selecting them by label type
# replaces the positional `iloc[:, 3:]`, which also swept in the PUMA and
# `households` columns (multiplying `households` by itself).
bin_cols = [col for col in t.columns if not isinstance(col, str)]
if bin_cols:
    t[bin_cols] = t[bin_cols].multiply(t['households'], axis=0)

# Persist the result with missing values written as 0.
t.fillna(0).to_csv('income_counts.csv')

# Add PUMA as an extra index level alongside the existing row index.
t = t.set_index('PUMA', append=True)

tracts = pkg.reference('us_tracts').dataframe()

In [None]:
# Keep only the tract identifiers, attach the PUMA crosswalk (merging on the
# columns the two frames share), and index the result by (geoid, PUMA).
tracts = (
    tracts[['geoid', 'tract_id']]
    .merge(ptm)
    .set_index(['geoid', 'PUMA'])
)
tracts.head()

In [None]:
# Attach the per-tract income counts to the tract list.
#
# `tracts` is already indexed by (geoid, PUMA), so the previous
# `tracts.set_index('geoid')` raised KeyError because 'geoid' is no longer a
# column. Instead, give `t` the same (geoid, PUMA) index and join on both levels.
# NOTE(review): assumes `t` still has a 'geoid' column (from the reset_index
# that built it) and 'PUMA' as an index level — confirm.
tract_counts = t.reset_index('PUMA').set_index(['geoid', 'PUMA'])
tract_income_dist = tracts.join(tract_counts)

# Put a constant-zero 'offset' column first.
tract_income_dist.insert(0, 'offset', 0)

In [None]:
# Display the joined tract / income-count frame for inspection.
tract_income_dist

In [None]:
# Collapse the per-bin columns into five income ranges. Each entry gives the
# first and last bin column (by label, inclusive) that is summed into the
# range; `None` means "through the last column".
range_columns = {
    'lt25k': (-5_000, 20_000),
    '25k_50k': (25_000, 45_000),
    '50k_75k': (50_000, 70_000),
    '75k_120k': (75_000, 115_000),
    'gt120k': (120_000, None),
}

income_ranges = tract_income_dist[['tract_id']].copy()
for range_name, (first_bin, last_bin) in range_columns.items():
    income_ranges[range_name] = tract_income_dist.loc[:, first_bin:last_bin].sum(axis=1)

income_ranges.describe()

In [None]:
# Rebuild `income_ranges` with a different set of break points. Each entry
# gives the first and last bin column (by label, inclusive) that is summed
# into the range; `None` means "through the last column".
range_columns = {
    'lt25k': (-5_000, 20_000),
    '25k_60k': (25_000, 55_000),
    '60k_100k': (60_000, 95_000),
    '100k_200k': (100_000, 195_000),
    'gt200k': (200_000, None),
}

income_ranges = tract_income_dist[['tract_id']].copy()
for range_name, (first_bin, last_bin) in range_columns.items():
    income_ranges[range_name] = tract_income_dist.loc[:, first_bin:last_bin].sum(axis=1)

In [None]:
# Summary statistics for the income-range columns built in the previous cell.
income_ranges.describe()