In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Notebook-wide setup: inline matplotlib rendering, seaborn's 'notebook'
# plotting context, and metapack's Jupyter initialisation.
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

# Run lib.py in this notebook's namespace. Later cells use names that are not
# defined in the notebook itself (eeduc_to_years is noted below as living in
# ./lib.py; get_metadata presumably comes from there too — TODO confirm).
%run ./lib.py

In [2]:
# Open the metapack package this notebook works on. The commented-out line is
# the alternative opener; presumably open_source_package() targets the
# in-development source package rather than a built one — TODO confirm.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare expression: display the package as the cell output.
pkg

In [3]:
# Follow this package's 'SIPP' reference to the package it points at; the
# data file and its metadata are read from that package in later cells.
sipp = pkg.reference('SIPP').package
# Bare expression: display the referenced package as the cell output.
sipp

In [4]:
# get_metadata is not defined in this notebook (presumably loaded by
# `%run ./lib.py` — TODO confirm). Of the four results, later cells use:
#   col_set.column  - names of the columns to load from the CSV
#   dtype_map       - looked up by upper-cased column name to get a dtype
#   col_label_map   - looked up by upper-cased column name to get a label
columns_by_group, col_label_map, dtype_map, col_set = get_metadata(sipp)


In [5]:

# Get all of the data.
# The selected column names are upper-cased once and reused for both `usecols`
# and `dtype`, so the two arguments always agree with each other.
wanted_cols = [c.upper() for c in col_set.column]

df = sipp.reference('pu2018_csv').read_csv(
    sep='|',
    usecols=wanted_cols,
    # Previously the dtype dict was keyed by the raw names from col_set.column
    # while usecols used the upper-cased ones. If the raw names are not already
    # upper case, those dtype keys match no column in the file and the dtypes
    # are not applied. Keying by the upper-cased names avoids that.
    dtype={c: dtype_map[c] for c in wanted_cols})

In [6]:
# Normalise every column name to lower case; all later cells refer to the
# columns by their lower-case names.
df.columns = list(map(str.lower, df.columns))

In [7]:

if False:
    # Disabled: optional merge of the replicate-weight file into df.
    # NOTE(review): this reads the reference from `pkg`, whereas the main data
    # file is read from `sipp` — confirm which package holds 'rw2018_csv'.
    df_rw = pkg.reference('rw2018_csv').read_csv(sep='|')

    #check these estimates against the validation xls file to help ensure that the data
    #    were read in correctly. Note that the validation xls files do not include all variables
    print('REPWGT100 mean:' + str(df_rw.REPWGT100.mean()))

    # df's column names were lower-cased in an earlier cell, while df_rw is
    # still accessed by upper-case name above; lower-case df_rw as well so the
    # lower-case join keys exist on both sides.
    df_rw.columns = [c.lower() for c in df_rw.columns]

    #Merge data and replicate weights on SSUID, PNUM, MONTHCODE
    # (fixed typos: join key 'monthcore' -> 'monthcode', rename key 'spane;_x' -> 'spanel_x')
    df = df.merge(df_rw, on=['ssuid','pnum','monthcode'])\
                .rename(columns={'swave_x':'swave', 'spanel_x':'spanel'})
    
 

def raceeth(r):
    """Return a combined race/ethnicity label for one row.

    Args:
        r: any object exposing ``eorigin`` and ``erace`` attributes
           (used below with DataFrame rows via ``df.apply(..., axis=1)``).

    Returns:
        'hisp' when ``eorigin`` equals 1, regardless of ``erace``; otherwise
        'white', 'black' or 'asian' for ``erace`` equal to 1, 2 or 3, and
        'other' for any other value.
    """
    # Origin code 1 wins over whatever the race code says.
    if r.eorigin == 1:
        return 'hisp'

    # Race codes are checked in order; the first equal code decides the label.
    for code, label in ((1, 'white'), (2, 'black'), (3, 'asian')):
        if r.erace == code:
            return label

    return 'other'

# Label every row with its combined race/ethnicity category.
df['raceeth'] = df.apply(raceeth, axis='columns')

# Constant column of ones, for counting.
df['dummy'] = 1

# Missing rfamkind values become 0.
df['rfamkind'] = df['rfamkind'].fillna(0)

# Translate the EEDUC categorical codes into years of education using the
# eeduc_to_years mapping from ./lib.py.
df['eeduc_years'] = df['eeduc'].replace(eeduc_to_years)

# Write the prepared frame to disk so other notebooks can load it directly.
df.to_feather('../data/inequality.feather')

In [8]:
# Copy the SIPP variable labels into the column descriptions of the
# 'sipp_18' resource schema.
for c in pkg.resource('sipp_18').schema_term.find('Table.Column'):
    # The previous guard (`if not c.value:`) was inverted: the body only ran
    # for columns with an empty/None name, where `c.value.upper()` either
    # raises or looks up '', so named columns never received a description.
    if not c.value:
        continue

    label = col_label_map.get(c.value.upper(), '')

    # Only assign when a label exists, so a missing label never blanks a
    # description. NOTE(review): an existing description IS replaced by the
    # label when one is found — confirm that is the intended behaviour.
    if label:
        c.description = label

# Persist the metadata change, then re-open the package and display the
# resource so the cell output reflects what was written.
pkg.write()
pkg = mp.jupyter.open_source_package()
pkg.resource('sipp_18')

Header,Type,Description
ssuid,integer,"Sample unit identifier. This identifier is created by scrambling together PSU, Sequence #1, Sequence #2, and the Frame Indicator for a case. It may be used in matching sample units from different waves."
spanel,integer,Panel year
swave,integer,Wave number of interview
efood1,integer,The food you bought did not last?
efood6,integer,"In 2017, were you ever hungry but didn't eat because there wasn't enough money for food?"
eawbsafe,integer,Is ... neighborhood safe from crime?
pnum,integer,Person number
ems,integer,"Is ... currently married, widowed, divorced, separated, or never married?"
erelrpe,integer,Household relationship (detailed categories)
esex,integer,Sex of this person


In [10]:
# Display the column index of df (names were lower-cased in an earlier cell)
# as a quick check of what was loaded and derived.
df.columns

Index(['ssuid', 'spanel', 'swave', 'efood1', 'efood6', 'eawbsafe', 'pnum',
       'ems', 'erelrpe', 'esex', 'eorigin', 'erace', 'eeduc', 'ebornus',
       'exmar', 'rpar1sex', 'rpar2sex', 'eprocert', 'efood_sr1yn',
       'rlunch_chld', 'rbreak_chld', 'rsnap_yryn', 'ejseddebt', 'eoeddebt',
       'edinrpar', 'rdinrpar', 'rdinrop', 'rgetby', 'eexpsch', 'rexpsch',
       'egifted', 'rlesson', 'empf', 'epnspouse', 'epnpar2', 'epnpar1',
       'epar1typ', 'epar2typ', 'monthcode', 'wpfinwgt', 'eresidenceid',
       'rfamref', 'rfpersons', 'rfrelu18', 'rfamkind', 'rhnumper',
       'rhnum65over', 'rhnum65ovrt2', 'ejb1_wshmwrk', 'ejb1_wsjob',
       'efood_mnyn', 'rsnap_mnyn', 'rfpov', 'rhpov', 'rin_univ', 'tage_fb',
       'tprvlvqrt', 'trace', 'tnum_cbu', 'thhldstatus', 'tjb1_pvtime',
       'toeddebtval', 'tage', 'tage_ehc', 'ttanf_amt', 'tmwkhrs', 'tpearn',
       'tval_home', 'tval_ast', 'tdebt_ast', 'tnetworth', 'thinc_stmf',
       'thval_home', 'thval_ast', 'thdebt_ast', 'thnetworth',