# Survey of Consumer Finances, Data Extraction

This notebook will get the links to the SCL 2016 files from a Metatab package and build
a dataset we can use for analysis.


In [3]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from pathlib import Path

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()


In [4]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [29]:
scf_pkg = pkg.reference('scf').resolved_url.doc
scf_pkg

Unnamed: 0,j7398,j7578,j7579,j7019,j7020,j7001,j7050,j8020,j8021,j5908,...,j6768,x42000,x42001,j306,j11272,j11572,j33001,x306,x11272,x11572
0,0,0,0,8,8,0,0,0,0,0,...,1,30609.361546,30598.896539,,,,0,,,
1,0,0,0,8,8,0,0,0,0,0,...,1,23575.752202,23561.874562,,,,0,,,
2,0,0,0,8,8,0,0,0,0,0,...,1,25762.246741,25726.122276,,,,0,,,
3,0,0,0,8,8,0,0,0,0,0,...,1,26511.942367,26488.317060,,,,0,,,
4,0,0,0,8,8,0,0,0,0,0,...,1,23825.934782,23809.061856,,,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28880,0,0,0,8,8,0,0,0,0,0,...,8,3335.499439,3335.491502,,,,0,,,
28881,0,0,0,8,8,0,0,0,0,0,...,8,3394.075511,3394.109280,,,,0,,,
28882,0,0,0,8,8,0,0,0,0,0,...,8,3204.468516,3204.540710,,,,0,,,
28883,0,0,0,8,8,0,0,0,0,0,...,8,3325.732483,3325.760360,,,,0,,,


In [51]:
mm_cols = [ f"MM{i}" for i in range(1,1000) ]
wt_cols = [ f"WT1B{i}" for i in range(1,1000)]

def extract(scf, scfe):

    scf.columns = [c.lower() for c in scf.columns]
    scfe.columns = [c.lower() for c in scfe.columns]
    
    # Extract and rename a set of columns we are interested in 
    cols = {
        'case_id': 'yy1',
        'record_id': 'y1',
        'age_1': 'x14', # Reconciled age
        'age_2': 'x19', # Reconciled age
        'hisp': 'x7004', # Do you consider yourself to be Hispanic or Latino in culture or origin?
        'race': 'x6809', # Race of respondent
        'addtional_race': 'x6810', # Respondent offered another race categot (1) or did not (5)
        # X6402 #In 2015, did (other adult) receive any income from wages or salaries?
        #'income': 'X5729', # How much was the total income you (and your family living here) received in 2015 from all sources, before taxes and other deductions were made?
        'unusual_income': 'x7650', # Is this income unusually high or low...
        'ed_1': 'x5931', # What is the highest level of school completed or the highest degree you have received?
        'ed_2': 'x6111', # What is the highest level of school completed or the highest degree you have received?
        'ed_mother_1': 'x6032', # What is the highest level of school or the highest degree mother completed?
        'ed_father_1': 'x6033', # What is the highest level of school or the highest degree father completed?
        'ed_mother_2': 'x6132', # What is the highest level of school or the highest degree mother completed?
        'ed_father_2': 'x6133', # What is the highest level of school or the highest degree father completed?  
        'occ_1': 'x7401', # What is the official title of your job?
        'occ_2': 'x7411', # What is the official title of your job?
        'gi_other_value': 'x5818', #How much altogether were any others (inheritances) you have received?
        'fin_risk': 'x7557', # Willingness to take fnancial risks, 1 to 10
        'shop_credit' : 'x7561', # Financial shopping
        'shop_credit_2' : 'x7562', # Financial Shopping
        'fin_know': 'x7556', # Financial Knowledge
        'borrow_vacation': 'x402', # Borrow for a vacation
        'plan_horizon': 'x3008', # which of the time periods listed on this page is most important to you
        'spend_exceeds':'x7510', # would you say that your (family's) spending exceeded your (family's) income,
        'spend_exceeds_excl_house':'x7508', # Spending exceeds, after purchase of house. 
        'wt0': 'x42001' # Weight

    }


    # There are three ( four actually) set os variables for gifts and transfers, 
    # We'll ignore the fourth, because it is rare. 

    gi_cols_packed = {
        'gi_type':  'x5803 x5808 x5813'.split(), # Was that an inheritance, a trust, or something else?
        'gi_value': 'x5804 x5809 x5814'.split(), # What was its approximate value at the time it was received?
        'gi_year':  'x5805 x5810 x5815'.split(), # In what year was it received?
        'gi_from':  'x5806 x5811 x5816'.split(), # From whom was it received?
    }

    gi_cols = {}
    
    for i in range(1,4):
        for k, c in gi_cols_packed.items():
            cols[f"{k}_{i}"] = c[i-1]

    
    df =  scf[cols.values()].rename(columns={v.lower():k for k,v in cols.items()})
    
    # make the implicate number
    df.insert(2, 'implicate_id', df.record_id - df.case_id.astype('int32')*10)   

    # Extract some variables from the public extract file. These variables
    # already have sensible names. 
    scf_ext = scfe[['y1','networth',  'income', 'nwcat', 'nwpctlecat', 
                    'norminc', 'ninccat',  'ninc2cat', 'nincpctlecat',
                    'occat1', 'occat2', 'edcl', 'lifecl', 'famstruct', 
                    'married',  'agecl', 'housecl', 'racecl','racecl4',
                    'asset', 'liq','bond', 'fin','nfin', 'debt', 'indcat', 
                    'equity','homeeq', 'revpay','bnkruplast5', 'debt2inc', 'hsaving' , 'saved'
                   ]].rename(columns={'y1':'record_id'})

    df = df.merge(scf_ext, on='record_id')

    return df

def munge(df):
    
    # Create more meaningful categoricals for some variables. 

    df['race'] = df['race'].astype('category').cat.rename_categories(
        {-7: 'other', 1: 'white', 2: 'black', 3: 'hisp', 4: 'asian', 5: 'aian', 6: 'nhpi'})

    for c in "gi_type_1 gi_type_2 gi_type_3".split():
        df[c] = df[c].astype('category').cat.rename_categories(
            {0: 'na', 1: 'inheritance', 2: 'trust', 3: 'gift'})

    bc = pd.CategoricalDtype(ordered=True) # Shortcut for assigning this type

    # Create categories for the educational attainments
    for c in "ed_1 ed_2 ed_mother_1 ed_mother_2 ed_father_1 ed_father_2".split():
        df[c] = df[c].astype(bc).cat.rename_categories(
            {-1: 'lt_grade_1', 0: 'na', 1: 'grade_1_4', 2: 'grade_5_6', 3: 'grade_7_8', 
             4: 'grade_9', 5: 'grade_10', 6: 'grade_11', 7: 'grade_12', 8: 'hs', 
             9: 'some_college', 10: 'assoc_vocational', 11: 'assoc_academic', 
             12: 'bachelors', 13: 'masters', 14: 'advanced'})

    # The lesser education of the household partners. 
    df['education'] = df[['ed_1','ed_2']].replace({0:7}).min(axis=1)

    # The lesser occupation of the household partners. 
    df['occ'] = df[['occ_1','occ_2']].replace({0:7}).min(axis=1)


    # Long-term planners, for plan_horizon/X3008 answered "next few years" or longer
    df['lt_planner'] = df.plan_horizon.isin([3,4,5]).astype(int)

    df['agecl'] = df.agecl.astype('category').cat.rename_categories(
        {1:'<35', 2:'35-44', 3:'45-54', 4:'55-64', 5:'65-74', 6:'>=75'})
    df['edcl'] = df.edcl.astype('category').cat.rename_categories(
        {1:'No HS', 2:'HS/GED', 3:'Some College', 4:'College'})

    # Income and networth percentiles
    df['nincpctle'] = df.nincpctlecat.astype(bc).cat.rename_categories(
        [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99])
    df['ninc'] = df.ninccat.astype(bc).cat.rename_categories(
        [0,20,40,60,80,90])
    df['nwpctle'] = df.nwpctlecat.astype(bc).cat.rename_categories(
        [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99])

    return df

def munge_gt(df):
    # Create summary variables for the number of each type of gift/transfer. This part is expensive. 
    t = df[['case_id','record_id',
            'gi_type_1', 'gi_type_2', 'gi_type_3',
            'gi_value_1', 'gi_value_2', 'gi_value_3',
            'gi_year_1', 'gi_year_2', 'gi_year_3'
           ]].set_index(['case_id','record_id'])

    def n_inherit_types(r):
        return r[['gi_type_1', 'gi_type_2', 'gi_type_3']].value_counts()

    nit = df.apply(n_inherit_types, axis=1).fillna(0)

    # Sum up all of the gifts/transfers. 
    df['gi_sum'] = df[['gi_value_1', 'gi_value_2', 'gi_value_3']].sum(axis=1)
    df['gi_sum'] = df.gi_sum.mask(df.gi_sum<0, 0) # Remove -9 and -1 values 

    df['n_gift'] = nit.gift.astype(int)
    df['n_inherit'] = nit.inheritance.astype(int)
    df['n_trust'] = nit.trust.astype(int)
    df['n_transfer'] = df['n_gift'] + df['n_inherit'] + df['n_trust']

    df['any_transfer'] = (df.n_transfer > 0).astype(int)
    df['any_inherit'] = (df.n_inherit > 0).astype(int)

    # Household size and per-capital
    df['hhsize'] = df.married.apply( lambda v: 1 if v ==2 else 2)
    df['networthpc'] = df.networth / df.hhsize
    df['assetpc'] = df.asset / df.hhsize

    return df


CPU times: user 25.8 s, sys: 177 ms, total: 26 s
Wall time: 26 s


In [52]:
# Main variables
scf = scf_pkg.reference('scf_public_16').dataframe()

# Extract file. The variables in this file are created by a SAS macro, which is
# the best (only?) documentation for the meaning of the variable
# https://www.federalreserve.gov/econres/files/bulletin.macro.txt

scfe = scf_pkg.reference('scf_extract_16').dataframe()

%time df16 = munge_gt(munge(extract(scf, scfe)))

CPU times: user 27.5 s, sys: 193 ms, total: 27.7 s
Wall time: 27.7 s


In [53]:
# Main variables
scf = scf_pkg.reference('scf_public_19').dataframe()

# Extract file. The variables in this file are created by a SAS macro, which is
# the best (only?) documentation for the meaning of the variable
# https://www.federalreserve.gov/econres/files/bulletin.macro.txt

scfe = scf_pkg.reference('scf_extract_19').dataframe()

%time df19 = munge_gt(munge(extract(scf, scfe)))


CPU times: user 25.2 s, sys: 136 ms, total: 25.3 s
Wall time: 25.3 s


In [61]:
df16.insert(0,'year',2016)
df19.insert(0,'year',2019)
df = pd.concat([df16, df19])

df.to_csv('../data/inherit_scf_16_19.csv', index=False) 

Unnamed: 0,year,case_id,record_id,implicate_id,age_1,age_2,hisp,race,addtional_race,unusual_income,...,gi_sum,n_gift,n_inherit,n_trust,n_transfer,any_transfer,any_inherit,hhsize,networthpc,assetpc
0,2016,1,11,1,71,0,5,white,5,3,...,0,0,0,0,0,0,0,1,1.879545e+05,1.883587e+05
1,2016,1,12,2,71,0,5,white,5,3,...,0,0,0,0,0,0,0,1,1.880715e+05,1.884650e+05
2,2016,1,13,3,71,0,5,white,5,3,...,0,0,0,0,0,0,0,1,1.879652e+05,1.883587e+05
3,2016,1,14,4,71,0,5,white,5,3,...,0,0,0,0,0,0,0,1,1.879652e+05,1.883587e+05
4,2016,1,15,5,71,0,5,white,5,3,...,0,0,0,0,0,0,0,1,1.880715e+05,1.884650e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28880,2019,5813,58131,1,51,55,5,white,5,3,...,0,0,0,0,0,0,0,2,3.817600e+06,3.839600e+06
28881,2019,5813,58132,2,51,55,5,white,5,3,...,0,0,0,0,0,0,0,2,3.864400e+06,3.886900e+06
28882,2019,5813,58133,3,51,55,5,white,5,3,...,0,0,0,0,0,0,0,2,3.742900e+06,3.765900e+06
28883,2019,5813,58134,4,51,55,5,white,5,3,...,0,0,0,0,0,0,0,2,3.997750e+06,4.024750e+06


In [57]:
cols = ['age_1','race','hisp','norminc', 'gi_sum','wt0']
df16[cols].describe()

Unnamed: 0,age_1,hisp,norminc,gi_sum,wt0
count,31240.0,31240.0,31240.0,31240.0,31240.0
mean,52.703585,4.519718,810447.1,313228.1,20163.524437
std,16.214756,1.300196,5614844.0,5928944.0,10859.38267
min,18.0,1.0,0.0,0.0,13.006523
25%,40.0,5.0,37705.15,0.0,15702.914143
50%,54.0,5.0,75410.3,0.0,21273.066201
75%,64.0,5.0,175598.3,0.0,26557.365912
max,95.0,5.0,325298500.0,334630000.0,63996.349244


In [58]:
df19[cols].describe()

Unnamed: 0,age_1,hisp,norminc,gi_sum,wt0
count,28885.0,28885.0,28885.0,28885.0,28885.0
mean,53.219837,4.527367,883189.6,229397.6,22268.033748
std,16.243653,1.291204,9893051.0,2431933.0,14075.230385
min,18.0,1.0,0.0,0.0,7.212384
25%,40.0,5.0,38688.48,0.0,10492.99978
50%,54.0,5.0,81449.43,0.0,23808.975167
75%,65.0,5.0,197514.9,7500.0,30895.566734
max,95.0,5.0,703590700.0,88010000.0,74136.603222


31240