In [1]:
import pandas as pd
import numpy as np
import metapack as mp
from pathlib import Path
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sdipylib.plot import  source_attribution

# /Users/eric/opt/anaconda3/envs/data/lib/python3.7/site-packages/pandas/plotting/_tools.py:307: MatplotlibDeprecationWarning: 
# The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
#   layout[ax.rowNum, ax.colNum] = ax.get_visible()
import warnings
warnings.simplefilter("ignore")

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

%run weights.py
%run lib.py 

source = "Survey of Consumer Finances, 2016 and 2019 pooled. 2020 Dollars"


# Sampling errors

From the documentation:
    
     An estimate of the total standard error attributable to imputation and sampling is given by SQRT((6/5)*imputation variance + sampling variance).


In [2]:
pkg = multi_open('federalreserve.gov-consumer_finances-2016e2019-inherit', print_ref=True)
print(pkg.ref)
pkg

Opening:  index:federalreserve.gov-consumer_finances-2016e2019-inherit
metapack+file:///Users/eric/proj/data-projects/metatab-packages/inequality-collection/federalreserve.gov-consumer_finances-2016-inherit/_packages/federalreserve.gov-consumer_finances-2016e2019-inherit-1.2.6/metadata.csv


In [3]:
pkg = multi_open('federalreserve.gov-consumer_finances', print_ref=True)
pkg

Opening:  index:federalreserve.gov-consumer_finances


In [4]:
w16 = pkg.reference('scf_weights_16').dataframe()
w16.rename(columns={'Y1':'record_id'}, inplace=True)
w16.insert(0,'year',2016)
w16.insert(1,'case_id',w16.record_id//10)
w16.columns = [e.lower() for e in w16.columns]

w19 = pkg.reference('scf_weights_19').dataframe()
w19.rename(columns={'y1':'record_id'}, inplace=True)
w19.insert(0,'year',2019)
w19.insert(1,'case_id',w19.record_id//10)

w = pd.concat([w16, w19]).sort_values(['year','case_id'])
cols = list(e for e in w.columns if not e.startswith('mm'))
w = w[cols]


In [5]:
pkg = multi_open('federalreserve.gov-consumer_finances-2016e2019-inherit', print_ref=True)
df = pkg.resource('inherit_scf_16_19').dataframe()

# Ensure weights has the same index as df
w = df[['year','case_id']].merge(w, on=['year','case_id'])

df.shape, w.shape

Opening:  index:federalreserve.gov-consumer_finances-2016e2019-inherit


((60125, 92), (60125, 1003))

In [20]:
df[df.year==2019].groupby('race').apply(wmedian, 'networth')

race
black     20730.0
hisp      36180.0
other    193700.0
white    181440.0
dtype: float64

In [19]:
def implicate_agg(df, cat_cols, stat_col, agg_f, ret_impl=False):
    '''Compute a statistic, and the standard error for an aggregation over
    dataset groups, using the implicates'''
    
    if not isinstance(cat_cols, (list, tuple)):
        cat_cols = [cat_cols]
    
    
    impl = df.groupby(cat_cols+['implicate_id']).apply(agg_f, stat_col)\
        .to_frame('stat').unstack()
    
    if ret_impl:
        return impl
    
    impl_se = impl.std(axis=1).to_frame('std')
    stat = impl.mean(axis=1)
    t =  pd.concat([stat, impl_se], axis=1)
    t.columns = ['stat', 'se']
    return t
   
 

stat = implicate_agg(df[df.year==2019], 'race', 'networth', wmedian, ret_impl=True)   
stat

Unnamed: 0_level_0,stat,stat,stat,stat,stat
implicate_id,1,2,3,4,5
race,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
black,21500.0,19700.0,20700.0,19940.0,20910.0
hisp,36050.0,37420.0,37102.0,35350.0,35660.0
other,177630.0,211000.0,193700.0,180900.0,194700.0
white,181200.0,184390.0,179300.0,180530.0,181800.0


In [12]:
def replicate_agg(df, w, cat_cols, stat_col, agg_f, n_replicates=None):
    
    if not isinstance(cat_cols, (list, tuple)):
        cat_cols = [cat_cols]
    
    t = df[['year','case_id',stat_col,'wt0']+cat_cols].merge(w, on=['year','case_id'])
    wt_cols = [c for c in t.columns if c.startswith('wt')]

    if n_replicates:
        wt_cols = wt_cols[:n_replicates]

    f = [t.groupby('race').apply(wmedian, stat_col, wtc).to_frame(wtc) for wtc in wt_cols]
    
    repl = pd.concat(f,axis=1)
    repl_se = repl.std(axis=1)
    repl_se
    
    return repl

repl = replicate_agg(df, w, 'race', 'networth', wmedian, n_replicates=3) 
repl

Unnamed: 0_level_0,wt0,wt1b1,wt1b2
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,18880.0,21484.16315,17830.791842
hisp,27014.739803,28769.634318,25900.0
other,158500.0,124863.403657,124180.0
white,178120.0,180170.0,168400.0


In [13]:

def implrepl_agg(df, w, cat_cols, stat_col, agg_f, n_replicates=None):
    
    impl = implicate_agg(df, cat_cols, stat_col, wmedian, ret_impl=True)
    
    repl = replicate_agg(df, w, cat_cols, stat_col, wmedian, n_replicates=n_replicates) 

    stat = pd.concat([impl, repl], axis=1).mean(axis=1)
    se = np.sqrt(6/5)*impl.std(axis=1) + repl.std(axis=1)

    t =  pd.concat([stat, se], axis=1)
    t.columns = ['stat', 'se']
    return t
    
    
%time implrepl_agg(df[df.year==2016], w, 'race', 'networth', wmedian, n_replicates=500) 
    

CPU times: user 9min 29s, sys: 6min 20s, total: 15min 49s
Wall time: 15min 52s


Unnamed: 0_level_0,stat,se
race,Unnamed: 1_level_1,Unnamed: 2_level_1
black,17178.765077,2449.275484
hisp,22337.721097,2914.472078
other,105985.940705,21318.483966
white,170014.535466,7824.842774


In [14]:
    
%time implrepl_agg(df[df.year==2016], w, 'race', 'networth', wmedian) 

    

CPU times: user 18min 46s, sys: 13min 5s, total: 31min 51s
Wall time: 31min 57s


Unnamed: 0_level_0,stat,se
race,Unnamed: 1_level_1,Unnamed: 2_level_1
black,17151.080162,2485.379158
hisp,22329.504278,2885.44037
other,106008.934459,20898.529432
white,169977.597403,7909.27946


In [15]:
    
%time implrepl_agg(df[df.year==2019], w, 'race', 'networth', wmedian) 

 

CPU times: user 17min 8s, sys: 12min 7s, total: 29min 15s
Wall time: 29min 22s


Unnamed: 0_level_0,stat,se
race,Unnamed: 1_level_1,Unnamed: 2_level_1
black,20741.077612,4122.443693
hisp,35395.004975,5399.722792
other,183131.781095,37249.183097
white,178450.483582,7367.481173


In [16]:
    
%time implrepl_agg(df, w, 'race', 'networth', wmedian) 

 

CPU times: user 35min 59s, sys: 25min 4s, total: 1h 1min 4s
Wall time: 1h 1min 11s


Unnamed: 0_level_0,stat,se
race,Unnamed: 1_level_1,Unnamed: 2_level_1
black,18504.042095,2416.606469
hisp,26704.526159,3350.385744
other,148418.565094,23072.671597
white,174556.602504,5900.922416
