In [1]:
import pandas as pd
import numpy as np
import metapack as mp
from pathlib import Path
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
from sdipylib.plot import  source_attribution

# /Users/eric/opt/anaconda3/envs/data/lib/python3.7/site-packages/pandas/plotting/_tools.py:307: MatplotlibDeprecationWarning: 
# The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
#   layout[ax.rowNum, ax.colNum] = ax.get_visible()
import warnings
warnings.simplefilter("ignore")

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

%run weights.py
%run lib.py 

source = "Survey of Consumer Finances, 2016 and 2019 pooled"


# Sampling errors

From the documentation:

> An estimate of the total standard error attributable to imputation and sampling is given by SQRT((6/5)*imputation variance + sampling variance).

Equivalently, $SE_{total} = \sqrt{\tfrac{6}{5}\,\sigma^2_{imputation} + \sigma^2_{sampling}}$.


In [67]:
# Open the pooled 2016/2019 inheritance package by name and show its reference.
# multi_open is not defined in this notebook; presumably it comes from one of
# the %run helper scripts (weights.py / lib.py) -- confirm.
pkg = multi_open('federalreserve.gov-consumer_finances-2016e2019-inherit', print_ref=True)
print(pkg.ref)
pkg

Opening:  index:federalreserve.gov-consumer_finances-2016e2019-inherit
metapack+file:///Users/eric/proj/data-projects/metatab-packages/inequality-collection/federalreserve.gov-consumer_finances-2016-inherit/_packages/federalreserve.gov-consumer_finances-2016e2019-inherit-1.2.5/metadata.csv


In [68]:
# Rebind `pkg` to the base consumer_finances package. The weights cell that
# follows reads its 'scf_weights_16' / 'scf_weights_19' references from `pkg`.
pkg = multi_open('federalreserve.gov-consumer_finances', print_ref=True)
pkg

Opening:  index:federalreserve.gov-consumer_finances


In [5]:
def _load_scf_weights(pkg, ref_name, year):
    """Load one survey year's weights reference as a normalized DataFrame.

    Parameters
    ----------
    pkg : package object exposing ``reference(name).dataframe()``
    ref_name : str
        Name of the weights reference in ``pkg`` (e.g. 'scf_weights_16').
    year : int
        Survey year stored in the inserted 'year' column.

    Returns
    -------
    DataFrame with lower-cased column names, the 'y1' column renamed to
    'record_id', and two leading columns added: 'year' and
    'case_id' (= record_id // 10).
    """
    weights = pkg.reference(ref_name).dataframe()
    # Lower-case first so the id column is found whether the file spells it
    # 'Y1' or 'y1', and so both years end up with identically-cased column
    # names before they are concatenated.
    weights = weights.rename(columns=str.lower).rename(columns={'y1': 'record_id'})
    weights.insert(0, 'year', year)
    weights.insert(1, 'case_id', weights.record_id // 10)
    return weights


w16 = _load_scf_weights(pkg, 'scf_weights_16', 2016)
w19 = _load_scf_weights(pkg, 'scf_weights_19', 2019)


In [7]:
# Rebind `pkg` to the pooled inheritance package and load its
# 'inherit_scf_16_19' resource as the main analysis frame `df`.
pkg = multi_open('federalreserve.gov-consumer_finances-2016e2019-inherit', print_ref=True)
df = pkg.resource('inherit_scf_16_19').dataframe()

Opening:  index:federalreserve.gov-consumer_finances-2016e2019-inherit


In [8]:
# Stack the two survey years' weights, ordered by year and then case.
w = pd.concat([w16, w19]).sort_values(['year', 'case_id'])

# Keep every column whose name does not start with 'mm'.
cols = [name for name in w.columns if not name.startswith('mm')]

# Join the remaining weight columns onto df's (year, case_id) keys so that
# `w` has one row for each matching row of `df`.
w = df[['year', 'case_id']].merge(w[cols], on=['year', 'case_id'])

# The row counts of the two frames should agree.
df.shape, w.shape

((60125, 92), (60125, 1003))

In [12]:
def make_descriptive_df(df):
    """Get the inheritance set, remove all races except black and white, and munge some values.

    The input frame is not modified; all derived columns are written to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: race, ed_father_1, ed_father_2,
        ed_mother_1, ed_mother_2, agecl, edcl, nwpctlecat, gi_value_cd,
        networth and nincpctle.

    Returns
    -------
    (df, df90) : tuple of pandas.DataFrame
        ``df``   -- rows whose race is 'white' or 'black', with added/recoded
                    columns n_bach, agecl, edcl, nwpctlecat,
                    gi_value_cd_decile, gi_value_cd_pctle and networth_pctle.
        ``df90`` -- the rows of ``df`` with networth_pctle <= 90 and
                    nincpctle < 90.
    """
    # The original body began with `if weights is None: weights = df.wt0`.
    # `weights` was never a parameter, so that test raised UnboundLocalError on
    # every call, and the value was not used anywhere; it has been removed.

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the caller's data (SettingWithCopyWarning).
    df = df[df.race.isin(['white', 'black'])].copy()

    # Count parents' bachelors degrees: number of the four parent-education
    # columns equal to 'bachelors' (missing values count as 0).
    parent_ed_cols = ['ed_father_1', 'ed_father_2', 'ed_mother_1', 'ed_mother_2']
    df['n_bach'] = df[parent_ed_cols].eq('bachelors').sum(axis=1)

    df['agecl'] = df.agecl.astype(
        pd.CategoricalDtype(['<35', '35-44', '45-54', '55-64', '65-74', '>=75'], ordered=True))
    df['edcl'] = df.edcl.astype(
        pd.CategoricalDtype(['No HS', 'HS/GED', 'Some College', 'College'], ordered=True))

    # Remap the nwpctlecat category numbers to percentile numbers: the sorted
    # distinct categories are paired, in order, with these percentile labels.
    m = dict(zip(sorted(df.nwpctlecat.unique()), '0 10 20 30 40 50 60 70 80 90 95 99'.split()))
    df['nwpctlecat'] = pd.to_numeric(df.nwpctlecat.replace(m))

    # Decile of gi_value_cd, with bin edges taken from the positive values only.
    _, gi_decile_bins = pd.qcut(df[df.gi_value_cd > 0].gi_value_cd, 10, retbins=True)
    gi_decile_bins[0] = 0  # lowest edge at zero so every positive value falls in a bin
    # Values outside the bins (e.g. exactly 0, since intervals are open on the
    # left) come back as NaN and are assigned to 0.
    df['gi_value_cd_decile'] = pd.cut(df.gi_value_cd, gi_decile_bins, labels=False).fillna(0)

    # Percentile of gi_value_cd among positive values; duplicate edges are dropped.
    _, gi_pctle_bins = pd.qcut(df[df.gi_value_cd > 0].gi_value_cd, 100, retbins=True, duplicates='drop')
    df['gi_value_cd_pctle'] = pd.cut(df.gi_value_cd, gi_pctle_bins, labels=False).fillna(0)

    # Percentile of networth among positive values; non-positive networth lands
    # outside the bins and is assigned to 0 by the fillna.
    _, networth_bins = pd.qcut(df[df.networth > 0].networth, 100, retbins=True, duplicates='drop')
    df['networth_pctle'] = pd.cut(df.networth, networth_bins, labels=False).fillna(0)

    # Trimmed subset: drops rows above the 90 cut-off on the computed net-worth
    # percentile or at/above 90 on nincpctle.
    # NOTE(review): the two comparisons differ (<= vs <) -- confirm intended.
    df90 = df[(df.networth_pctle <= 90) & (df.nincpctle < 90)]

    return df, df90
    
# Build the black/white analysis frame and the subset returned alongside it,
# then preview the first rows of the former.
dfbw, dfs90 = make_descriptive_df(df)

dfbw.head()

Unnamed: 0,year,case_id,record_id,implicate_id,age_1,age_2,hisp,race,addtional_race,unusual_income,...,networthpc,assetpc,gi_pv_10,gi_pv_7,gi_pv_5,gi_value_cd,n_bach,gi_value_cd_decile,gi_value_cd_pctle,networth_pctle
0,2016,1,11,1,71,0,5,white,5,3,...,202345.699011,202780.801804,0.0,0.0,0.0,0.0,0,0.0,0.0,39.0
1,2016,1,12,2,71,0,5,white,5,3,...,202471.64982,202895.30254,0.0,0.0,0.0,0.0,0,0.0,0.0,39.0
2,2016,1,13,3,71,0,5,white,5,3,...,202357.149085,202780.801804,0.0,0.0,0.0,0.0,0,0.0,0.0,39.0
3,2016,1,14,4,71,0,5,white,5,3,...,202357.149085,202780.801804,0.0,0.0,0.0,0.0,0,0.0,0.0,39.0
4,2016,1,15,5,71,0,5,white,5,3,...,202471.64982,202895.30254,0.0,0.0,0.0,0.0,0,0.0,0.0,39.0


In [47]:

from IPython.display import display

# Names of the columns of `w` that start with 'wt1'.
weight_cols = [name for name in w.columns if name.startswith('wt1')]
import random

def sample(N, wc, random_state=None):
    """Draw N rows from the notebook-level ``df``, with replacement, weighted by ``w[wc]``.

    Parameters
    ----------
    N : int
        Number of rows to draw.
    wc : str
        Name of the column of the notebook-level ``w`` used as sampling weights.
        pandas aligns the weights to ``df`` by index.
    random_state : optional
        Passed through to ``DataFrame.sample``. The default (None) keeps the
        original unseeded behavior; pass an int or generator for a
        reproducible draw.

    Returns
    -------
    pandas.DataFrame with N rows of ``df``.
    """
    return df.sample(N, replace=True, weights=w[wc], random_state=random_state)


In [48]:
def geo_range(start, stop, r):
    """Yield the integer parts of a geometric progression below ``stop``.

    Starting from ``start``, the running value is multiplied by ``r`` after
    each step; ``int(value)`` is yielded for as long as ``value < stop``.
    Nothing is yielded when ``start >= stop``.

    Parameters
    ----------
    start : number
        First value of the progression.
    stop : number
        Exclusive upper bound.
    r : number
        Common ratio.

    Raises
    ------
    ValueError
        If ``start < stop`` and the progression can never reach ``stop``
        (``start == 0``, or ``start > 0`` with ``abs(r) <= 1``). Without this
        check the loop would run forever. Raised when iteration begins.
    """
    if start < stop and (start == 0 or (start > 0 and abs(r) <= 1)):
        raise ValueError(
            "geo_range would never reach stop: start=%r, stop=%r, r=%r" % (start, stop, r))

    i = start
    while i < stop:
        # int() truncates, so nearby values can repeat when the ratio is small.
        yield int(i)
        i *= r


list(geo_range(100, 1000, 1.2))

[100, 120, 144, 172, 207, 248, 298, 358, 429, 515, 619, 743, 891]

In [70]:
# Imports first: the original imported pickle inside the loop, after
# pickle.load had already been called, which fails on a fresh kernel.
import pickle

from tqdm import tqdm

# On-disk cache of finished simulation points so the cell can be re-run or
# resumed without repeating work.
SERIES_CACHE = Path('series.pickle')

# Every (sample size N, number of weight columns ncw) combination to simulate.
points = [(N, ncw) for ncw in geo_range(100, 1000, 1.2) for N in geo_range(100_000, 1e6, 1.2)]

if SERIES_CACHE.exists():
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with SERIES_CACHE.open('rb') as f:
        series = pickle.load(f)
else:
    # First run: nothing cached yet.
    series = []

# Points that already have a cached result.
seen = {(N, ncw) for N, ncw, _ in series}

for N, ncw in tqdm(points):

    if (N, ncw) in seen:
        continue

    # Median networth of one weighted resample per randomly chosen weight column.
    v = pd.Series([sample(N, wc).networth.median() for wc in random.choices(weight_cols, k=ncw)])

    series.append((N, ncw, v))
    seen.add((N, ncw))

    # Checkpoint after every point. Write to a temporary file and swap it in so
    # an interrupted run cannot leave a truncated cache behind.
    tmp_path = SERIES_CACHE.with_suffix('.pickle.tmp')
    with tmp_path.open('wb') as f:
        pickle.dump(series, f, pickle.HIGHEST_PROTOCOL)
    tmp_path.replace(SERIES_CACHE)
    

100%|██████████| 169/169 [00:00<00:00, 416718.03it/s]


In [71]:
# Display the merged weights frame `w` built in the concat/merge cell above
# (pandas abbreviates the repr of a frame this large).
w

Unnamed: 0,year,case_id,record_id,wt1b1,wt1b2,wt1b3,wt1b4,wt1b5,wt1b6,wt1b7,...,wt1b991,wt1b992,wt1b993,wt1b994,wt1b995,wt1b996,wt1b997,wt1b998,wt1b999,yy1
0,2016,1,11,,30675.633028,30501.527264,32288.373838,30781.543684,31745.550789,,...,,,33166.586607,31820.929574,30895.569878,31136.328418,31601.909667,31530.264161,,1
1,2016,1,11,,30675.633028,30501.527264,32288.373838,30781.543684,31745.550789,,...,,,33166.586607,31820.929574,30895.569878,31136.328418,31601.909667,31530.264161,,1
2,2016,1,11,,30675.633028,30501.527264,32288.373838,30781.543684,31745.550789,,...,,,33166.586607,31820.929574,30895.569878,31136.328418,31601.909667,31530.264161,,1
3,2016,1,11,,30675.633028,30501.527264,32288.373838,30781.543684,31745.550789,,...,,,33166.586607,31820.929574,30895.569878,31136.328418,31601.909667,31530.264161,,1
4,2016,1,11,,30675.633028,30501.527264,32288.373838,30781.543684,31745.550789,,...,,,33166.586607,31820.929574,30895.569878,31136.328418,31601.909667,31530.264161,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60120,2019,5813,58131,2881.631412,3122.010728,3634.425238,2802.690049,2458.055375,2735.260725,,...,3473.500564,3524.086922,,,,,,2945.979369,4080.004804,5813
60121,2019,5813,58131,2881.631412,3122.010728,3634.425238,2802.690049,2458.055375,2735.260725,,...,3473.500564,3524.086922,,,,,,2945.979369,4080.004804,5813
60122,2019,5813,58131,2881.631412,3122.010728,3634.425238,2802.690049,2458.055375,2735.260725,,...,3473.500564,3524.086922,,,,,,2945.979369,4080.004804,5813
60123,2019,5813,58131,2881.631412,3122.010728,3634.425238,2802.690049,2458.055375,2735.260725,,...,3473.500564,3524.086922,,,,,,2945.979369,4080.004804,5813
