In [1]:

import pandas as pd
import numpy as np
import metapack as mp
from pathlib import Path
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
from sdipylib.plot import  source_attribution
from IPython.display import HTML, display

# /Users/eric/opt/anaconda3/envs/data/lib/python3.7/site-packages/pandas/plotting/_tools.py:307: MatplotlibDeprecationWarning: 
# The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
#   layout[ax.rowNum, ax.colNum] = ax.get_visible()
import warnings
warnings.simplefilter("ignore")

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

%run weights.py
%run lib.py 

source = "Survey of Consumer Finances, 2016 and 2019 pooled"


# Reweighting


## References

Freedman, David A., and Richard A. Berk. “Weighting Regressions by Propensity Scores.” Evaluation Review 32, no. 4 (August 2008): 392–409. https://doi.org/10.1177/0193841X08317586.


In [2]:
pkg = mp.multi_open('federalreserve.gov-consumer_finances-2016e2019-inherit', print_ref=True)
#pkg = mp.jupyter.open_source_package()
pkg

Opening:  index:federalreserve.gov-consumer_finances-2016e2019-inherit


In [3]:
pkg.reference('scf').package

In [4]:
pkg.resource('inherit_scf_16_19')

Header,Type,Description
year,integer,Year of the SCL reelase the record was drawn from
case_id,integer,
record_id,integer,
implicate_id,integer,"Impicate number, 1 to 5"
age_1,integer,Age of first household adult
age_2,integer,Age of second household adult.
hisp,integer,Is the HH Hispanic
race,string,Simpified race for HH
addtional_race,integer,
unusual_income,integer,


In [5]:
inherit_scf_16_19 = pkg.resource('inherit_scf_16_19').dataframe()

df =   make_descriptive_df(inherit_scf_16_19)    

# Create an ID variable that distinguishes years
df['id'] = df.year.astype(str)+'-'+df.record_id.astype(str)
    

In [6]:
# Group by the values we are normalizing and count them, per race

def reweight_black(df, factors):
    """Create new weights for blacks to match factor groupings of white, using a sampling method.
    This will re-weight blacks to have the same popuation size as whites"""
    # Sample to make handing weights easier. Could probably group and sum
    # the weights
    dfs = df.sample(1_000_000, replace=True, weights=df.wt0)
    
    t = dfs.groupby(factors + ['race']).case_id.count().to_frame().unstack()
    t = t.droplevel(0, axis=1)

    # For the black records, where the number in the category is non zero
    # compute the w:b ratio
    tnz = t[ (t.black!=0)]
    t.loc[(t.black!=0),'ratio'] = tnz.white/tnz.black

    t = t.fillna(0)

    # Merge the black ratios back in. The white ratios are always 1
    x = df[df.race=='black'].merge(t[['ratio']].reset_index(), on=factors)

    z = df.merge(x[['id','ratio']], on='id', how='left')
    z['ratio'] = z.ratio.fillna(1) # White ratio is 1
    z['rw'] = z.wt0 * z.ratio # Create the new weight

    return z
    
def reweight_black_wt(df, factors):
    """Re-weight blacks, but directly use the weights, rather than sampling. This will re-weights blacks
    to have the same factor-group ratio, but retaining the size of the black population"""
    t = df.groupby(factors + ['race']).wt0.sum().to_frame().unstack().droplevel(0, axis=1).fillna(0)
    t.loc[t.black == 0,'black'] = t.loc[t.black == 0,'black'].replace({0:1})
    s = t.sum()

    p = t/s # portion of population per category

    r = (p.white / p.black) # factor to multiply black weights by
    t.loc[:, 'black_adj'] = t.black * r

    t = df[df.race=='black']

    t = t.merge(r.to_frame('black_adj').reset_index(), on=factors )
    df_adj = t[['id','black_adj']].merge(df, on='id', how='right')
    df_adj['black_adj'] = df_adj['black_adj'].fillna(1)
    df_adj['wt0_adj'] = df_adj.wt0 * df_adj.black_adj

    return df_adj
    
def rd(a,b):
    """Relative difference"""
    return np.abs(a-b)/ (a)
    
def wbdiff(df):
    return rd(df.white,df.black).round(3)
    
def rw_report(df1, df2, f):
    
    from IPython.display import HTML

    t1 = f(df1)
    t2 = f(df2)
    
    stats = pd.DataFrame({
        'Unadj': t1, 
        'Adj':t2
    })
    
   
    s1 = stats.Adj - stats.Unadj
    s2 = pd.Series(stats.T.white - stats.T.black, name='Diff')
    
    stats['Diff'] = (s1) 
    stats = stats.append(s2).fillna('')
    
    t = pd.DataFrame([
        [(t1.white - t1.black).round(0).astype(int),  (t1.white/t1.black).round(2), wbdiff(t1)],
        [(t2.white - t2.black).round(0).astype(int),  (t2.white/t2.black).round(2), wbdiff(t2)  ],
    ], columns = ['Diff', 'Ratio', 'Rel Diff'])
   
    s = (t.iloc[1]/t.iloc[0])
    s.name = 3
    t = t.append(s)
    t.index = ['Unadj','Adj','Ratio']
    
    
    o = "<h2>Statistic</h2>"+stats.to_html()+"<h2>Differences</h2>"+t.to_html()
     
    return HTML(o)
 
def run_rw(factors):
    dfrw = reweight_black(df,factors)

    dfs = df.sample(1_000_000, replace=True, weights=df.wt0)
    dfz = dfrw.sample(1_000_000, replace=True, weights=dfrw.rw)
    
    return rw_report(dfs, dfz, lambda df : df.groupby('race').networth.median())

In [27]:
#factors =  [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds', 'housecl']
factors =  [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds']
factors =  ['edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds',  'borrow_vacation', 'famstruct',
           'fin_know', 'fin_risk']


def do_adjust(df, factors):
    dfa = reweight_black(df, factors)
    dfa = reweight_black_wt(dfa, factors)


    # Adjustment using weighted Medians
    return pd.concat([
        df.groupby('race').apply(wmedian,'networth', 'wt0' ).to_frame('unadj'),
        dfa.groupby('race').apply(wmedian,'networth', 'wt0_adj' ).to_frame('adj')
    ], axis=1)
    
    
do_adjust(df, ['married', 'agecl', 'edcl', 'occat1'])


Unnamed: 0_level_0,unadj,adj
race,Unnamed: 1_level_1,Unnamed: 2_level_1
black,18880.0,59666.413502
white,178120.0,178120.0


In [8]:
#factors =  [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds', 'housecl']
factors =  [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds']
factors =  ['edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds',  'borrow_vacation', 'famstruct',
           'fin_know', 'fin_risk']


dfa = reweight_black(df, factors)
dfa = reweight_black_wt(dfa, factors)


dfs = df.sample(1_000_000, replace=True, weights=df.wt0)
dfz = dfa.sample(1_000_000, replace=True, weights=dfa.wt0_adj)



In [9]:
# Adjustment using sampling. 
rw_report(dfs, dfz, lambda df : df.groupby('race').networth.median())

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,18920.953586,77374.894515,58453.9
white,178290.0,178150.0,-140.0
Diff,159369.046414,100775.105485,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,159369.0,9.42,0.894
Adj,100775.0,2.3,0.566
Ratio,0.632338,0.244161,0.63311


In [10]:
rw_report(dfs, dfz, lambda df : df.groupby('race').norminc.median())

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,40724.716063,61087.074094,20362.4
white,67195.781504,67195.781504,0.0
Diff,26471.065441,6108.707409,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,26471.0,1.65,0.394
Adj,6109.0,1.1,0.091
Ratio,0.230781,0.666667,0.230964


In [11]:
dfs.revpay

259        0.000000
8803       0.000000
38656    462.500000
42634      0.000000
25810     13.294655
            ...    
5919       0.000000
9819     850.857947
44645     92.500000
58345      0.000000
9729       0.000000
Name: revpay, Length: 1000000, dtype: float64

In [12]:
target = 'networth'

def reweighted_mean(dfz):
    weights = dfz.groupby(['race',f]).case_id.count().unstack()
    return (t2*weights).sum(axis=1)/weights.sum(axis=1)

factors =  ['edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds',  'borrow_vacation', 'famstruct',
           'fin_know', 'fin_risk']

dfs = df.sample(100_000, replace=True, weights=df.wt0)

for f in factors:
    
    dfrw = reweight_black(df,[f])
    dfz = dfrw.sample(100_000, replace=True, weights=dfrw.rw)
    
    t1 = dfs.groupby('race').networth.median()
    t2 = dfz.groupby('race').networth.median()
    d1 = t1.white - t1.black
    d2 = t2.white - t2.black
    
    print( f, (d1-d2)/d1 )

    

edcl 0.0644674544130353
agecl 0.07814666873951999
married 0.10085780859897858
lt_planner 0.04377247672641735
occat1 0.026272888065201457
spend_exceeds 0.06752191848092463
borrow_vacation 0.006171145379371087
famstruct 0.13683903913861944
fin_know 0.04196420347855041
fin_risk 0.009899064081422984


In [13]:
run_rw(factors)

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,18857.139241,77374.894515,58517.8
white,177821.0,177776.132208,-44.8678
Diff,158963.860759,100401.237693,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,158964.0,9.43,0.894
Adj,100401.0,2.3,0.565
Ratio,0.631596,0.243902,0.631991


In [14]:
rw_report(dfs, dfz, lambda df : df.groupby('race').norminc.mean())

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,58477.407565,60272.360204,1794.95
white,120723.935648,121624.945783,901.01
Diff,62246.528084,61352.585579,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,62247.0,2.06,0.516
Adj,61353.0,2.02,0.504
Ratio,0.985638,0.980583,0.976744


In [15]:
dfrw = reweight_black(df, [ 'nincpctlecat'])

dfs = df.sample(1_000_000, replace=True, weights=df.wt0)
dfz = dfrw.sample(1_000_000, replace=True, weights=dfrw.rw)

t = rw_report(dfs, dfz, lambda df : df.groupby('race').networth.median())
t

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,18880.0,57188.289733,38308.3
white,178480.0,179318.312236,838.312
Diff,159600.0,122130.022504,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,159600.0,9.45,0.894
Adj,122130.0,3.14,0.681
Ratio,0.765226,0.332275,0.761745


In [16]:
dfs = df.sample(1_000_000, replace=True, weights=df.wt0)

In [17]:
t = dfs[dfs.housecl == 1]
dfs.groupby(['nincpctle', 'race']).homeeq.median().unstack().astype(int)#.plot()

race,black,white
nincpctle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
10,0,11699
20,0,29780
30,0,35000
40,1000,50000
50,7000,52000
60,22000,75000
70,28716,81000
80,60623,128692
90,125000,224000


In [18]:
rw_report(dfs, dfz, lambda df : df.groupby('race').homeeq.mean())

Unnamed: 0_level_0,Unadj,Adj,Diff
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black,49172.021026,82900.177008,33728.2
white,168682.898024,168137.052268,-545.846
Diff,119510.876999,85236.87526,

Unnamed: 0,Diff,Ratio,Rel Diff
Unadj,119511.0,3.43,0.708
Adj,85237.0,2.03,0.507
Ratio,0.713215,0.591837,0.716102


# Characterize Factor Groups

In [19]:
dfs = df.sample(500_000, replace=True, weights=df.wt0)
factors = [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds', 'housecl']
t = dfs.groupby(factors + ['race']).case_id.count().to_frame().unstack()
t = t.droplevel(0, axis=1)
tnz = t[ (t.black!=0)]
tnz

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,race,black,white
edcl,agecl,married,lt_planner,occat1,spend_exceeds,housecl,Unnamed: 7_level_1,Unnamed: 8_level_1
No HS,<35,1,0,1,1,2,157,145
No HS,<35,1,0,1,2,2,61,314
No HS,<35,1,0,1,3,2,47,108
No HS,<35,1,0,4,3,2,108,0
No HS,<35,1,1,1,2,2,113,326
...,...,...,...,...,...,...,...,...
College,>=75,2,0,3,3,2,26,51
College,>=75,2,1,1,2,2,92,0
College,>=75,2,1,3,2,1,18,1133
College,>=75,2,1,3,3,1,237,1656


In [20]:
dfs = df.sample(500_000, replace=True, weights=df.wt0)
factors = [ 'edcl','agecl', 'married','lt_planner','occat1', 'spend_exceeds', 'housecl']
t = dfs.groupby(factors + ['race']).wt0.agg(['mean','std']).unstack()
t = t.dropna()
t

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,mean,mean,std,std
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,race,black,white,black,white
edcl,agecl,married,lt_planner,occat1,spend_exceeds,housecl,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
No HS,<35,1,0,1,1,2,25028.821300,21626.405905,4296.148307,2968.914129
No HS,<35,1,0,1,2,2,18793.251746,27671.215011,642.250375,5719.238335
No HS,<35,1,0,1,3,2,22716.479689,27623.512126,50.665965,5333.387317
No HS,<35,1,1,1,2,2,27559.654115,26732.829307,3890.986694,5573.349371
No HS,<35,1,1,4,1,2,20382.166991,15228.564670,0.000000,37.273419
...,...,...,...,...,...,...,...,...,...,...
College,>=75,2,0,3,3,1,31874.603176,33174.870090,7805.605634,13994.003200
College,>=75,2,0,3,3,2,15647.757599,18092.785659,111.185001,8586.119084
College,>=75,2,1,3,2,1,7267.517539,45954.950855,138.079145,15555.899596
College,>=75,2,1,3,3,1,38572.449472,29500.392800,11527.358806,10667.911582


In [21]:
t.loc[:,'mean'].divide(t.loc[:,'std']).sort_values('white', ascending=False).replace({np.inf:np.nan}).dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,race,black,white
edcl,agecl,married,lt_planner,occat1,spend_exceeds,housecl,Unnamed: 7_level_1,Unnamed: 8_level_1
No HS,65-74,2,0,3,1,1,243.131108,4687.872265
No HS,65-74,2,1,3,1,1,661.498216,3410.810130
No HS,35-44,2,1,4,2,2,306.063962,2002.585239
HS/GED,45-54,2,1,4,3,1,237.284691,1962.482435
Some College,35-44,2,0,3,3,2,4.612466,1912.148433
...,...,...,...,...,...,...,...,...
College,>=75,1,1,2,3,1,2.428884,1.769046
HS/GED,65-74,1,1,2,3,1,2.452188,1.675448
College,65-74,1,1,2,3,1,2.218738,1.428567
College,45-54,1,1,2,3,1,2.092556,1.374332


In [22]:
tx = t[(t.black < 1e6) & (t.white < 1e6)]
print(t.corr())

sns.scatterplot(t.black, t.white)

AttributeError: 'DataFrame' object has no attribute 'black'

In [None]:
(tx.white - tx.black).describe()

In [None]:
tx = t[(t.black > t.white)]
sns.scatterplot(tx.black, tx.white)