In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
# Select seaborn's 'notebook' plotting context for every figure drawn below.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook. NOTE(review): its exact side effects
# are not visible from this notebook -- confirm against the metapack docs.
mp.jupyter.init()


In [12]:
# Alternative opener, kept for reference; this notebook works against the
# source package instead (it is modified and re-written in the last cell).
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
# Bare last expression: display the package object.
pkg

In [4]:
# Resolve this package's 'SIPP' reference to the package it points at. The
# schema and CSV files read below are all looked up through `sipp`.
sipp = pkg.reference('SIPP').package
# Bare last expression: display the referenced package.
sipp

In [5]:
'''
The following code is an example of reading pipe-delimited Survey of Income and Program Participation (SIPP) 
    data into a Pandas dataframe. Specifically, this code loads and merges the primary dataset and the 
    calendar-year replicate weights (as opposed to the longitudinal replicate weights) in preparation for 
    analysis.
This code is written for Python 3, and requires version 0.24 or higher of the Pandas module. 
Note the use of 'usecols' in the first pd.read_csv statement. Most machines do not have enough memory to read
    the entire SIPP file into memory. Use 'usecols' to read in only the columns you are interested in. If you
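    still encounter an out-of-memory error, either select fewer columns, or consider using the Dask module.
Run this code in the same directory as the data. (This applies to the original stand-alone script; this
    notebook instead loads the files through sipp.reference(...).)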
This code was written by Adam Smith. Please report errors to <census.sipp@census.gov>.
'''

#Read in the primary data file schema to get data-type information for
#    each variable.
rd_schema = pd.read_json(sipp.reference('pu2018_schema').resolved_url.get_resource().read())

#Read in the replicate weight data file schema to get data-type information
#    for each variable.
rw_schema = pd.read_json(sipp.reference('rw2018_schema').resolved_url.get_resource().read())


# Schema type name -> pandas dtype name. The capitalised 'Int64' / 'Float64'
# are pandas' nullable extension dtypes, so missing values do not force an
# integer column to float.
SCHEMA_TO_PANDAS_DTYPE = {
    'integer': 'Int64',
    'string': 'object',
    'float': 'Float64',
}


def schema_to_pandas_dtype(schema_dtype):
    """Translate one schema ``dtype`` entry into a pandas dtype name.

    Args:
        schema_dtype: Value taken from a schema's ``dtype`` column
            ('integer', 'string' or 'float').

    Returns:
        The matching pandas dtype name, or the sentinel string 'ERROR' for
        any other value. The sentinel is not a valid dtype, so an unmapped
        type surfaces as soon as it is actually passed to a reader.
    """
    return SCHEMA_TO_PANDAS_DTYPE.get(schema_dtype, 'ERROR')


#Define Pandas data types based on the schema data-type information for both
#    schema dataframes (one shared helper instead of two copies of the rules).
rd_schema['dtype'] = [schema_to_pandas_dtype(x) for x in rd_schema['dtype']]
rw_schema['dtype'] = [schema_to_pandas_dtype(x) for x in rw_schema['dtype']]

In [8]:

# Lookup tables built from the primary-file schema, keyed by the schema's
# variable name (the cells below look entries up with `c.upper()`):
#   col_label_map: variable name -> descriptive label
#   dtype_map:     variable name -> pandas dtype name

col_label_map = {name: label
                 for name, label in zip(rd_schema['name'], rd_schema['label'])}
dtype_map = {name: dtype
             for name, dtype in zip(rd_schema['name'], rd_schema['dtype'])}

# Variables to load from the primary SIPP file, in lower case. Each name is
# listed exactly once: 'ems' and 'rfrelu18' used to appear twice. The trailing
# comments are the schema labels (see the regeneration snippet after the list).
use_cols = [
    'ssuid', # Sample unit identifier. This identifier is created by scrambling together PSU, Sequence #1, Sequence #2, and the Frame Indicator for a case. It may be used in matching sample units from different waves.
    'pnum', # Person number
    'monthcode', # Value of reference month
    'eresidenceid', # This field stores a unique six-digit identifier for residence addresses.
    'erelrpe', # Household relationship (detailed categories)
    'spanel', # Panel year
    'swave', # Wave number of interview
    'wpfinwgt', # Final person weight
    'thhldstatus', # Household status
    'rin_univ', # Monthly indicator that respondent is in survey frame universe
    'esex', # Sex of this person
    'tage', # Age as of last birthday
    'tage_ehc', # Monthly age during the reference year.
    'erace', # What race(s) does ... consider herself/himself to be?
    'trace', # What race(s) does ... consider herself/himself to be? (detailed categories)
    'eorigin', # Is ... Spanish, Hispanic, or Latino?
    'eeduc', # What is the highest level of school ... completed or the highest degree received by December of (reference year)?
    'ems', # Is ... currently married, widowed, divorced, separated, or never married?
    'ebornus', # Where was ... born?
    'eawbsafe', # Is ... neighborhood safe from crime?
    'edinrpar', # How many days in a typical week did reference parent eat dinner with child/children?
    'eexpsch', # Child/children ever been suspended or expelled from school?
    'rexpsch', # Child was expelled
    'egifted', # Child/children in gifted class/classes.
    'rlesson', # Child takes lessons
    'rgetby', # Child does just enough to get by
    'efood1', # The food you bought did not last?
    'efood6', # In 2017, were you ever hungry but didn't eat because there wasn't enough money for food?
    'rbreak_chld', # Did child usually get the school breakfast that his/her school provided?
    'rlunch_chld', # Did child usually get the school lunch that his/her school provided?
    'efood_mnyn', # Did ... receive food assistance (not SNAP) this month (1-12)?
    'efood_sr1yn', # Did ... receive assistance from a government social service agency?
    'rdinrpar', # Number of times reference parent had dinner with child 0-17.
    'rdinrop', # Number of times other parent had dinner with child 0-17.
    'rsnap_mnyn', # Received SNAP benefits this month
    'rsnap_yryn', # Received SNAP benefits in at least one month of the reference period
    'ttanf_amt', # Value of the TANF benefits received this month
    'rfpov', # Family poverty threshold in this month, excluding Type 2 individuals
    'rhpov', # Household poverty threshold in this month, excluding Type 2 individuals
    'tfcyincpov', # Family income-to-poverty ratio in this calendar year, excluding Type 2 individuals
    'thcyincpov', # Household income-to-poverty ratio in this calendar year, excluding Type 2 individuals
    'empf', # Do you have children with more than one partner?
    'tage_fb', # Age at first birth
    'tnum_cbu', # Number of child bearing unions
    'exmar', # Number of times married
    'tprvlvqrt', # Type of living quarters for the residence.
    'tpearn', # Sum of earnings and profits/losses from all jobs, varying with the number of days in the month (comparable to RPEARN in 1996, 2001, and 2014 SIPP panels).
    'tmwkhrs', # Average number of hours worked per week at all jobs held during the reference month.
    'tptotinc', # Sum of personal monthly earnings and income for people age 15 and older, as well as children under age 15 who received SSI payments
    'tptrninc', # Sum of monthly income received from means-tested transfer programs (including SSI, TANF, GA, and the Veterans Pension program)
    'thnetworth', # Household-level net worth [this is household-level data, therefore this value is copied to every member of the household].
    'thval_ast', # Household-level sum of all asset values (TVAL_AST) [this is household-level data, therefore this value is copied to every member of the household].
    'thval_home', # Household-level sum of value of primary residence (TVAL_HOME) [this is household-level data, therefore this value is copied to every member of the household].
    'thdebt_ast', # Household-level sum of all debt (TDEBT_AST) [this is household-level data, therefore this value is copied to every member of the household].
    'tdebt_ast', # Person-level sum of all debt (TDEBT_SEC, TDEBT_USEC).
    'thinc_stmf', # Household-level sum of income earned over the reference period from stocks and mutual funds (TINC_STMF) [this is household-level data, therefore this value is copied to every member of the household].
    'eoeddebt', # Owed any money for student loans or educational expenses in own name only during the reference period.
    'toeddebtval', # Amount of student loans or educational expenses owed in own name only as of the last day of the reference period.
    'ejseddebt', # Owed any money for student loans or educational expenses jointly with a spouse or civil union partner during the reference period.
    'eprocert', # Has...earned a professional certification or license?
    'ejb1_wshmwrk', # Were there any days when ... worked only from home?
    'ejb1_wsjob', # What is the best description of ... work schedule?
    'tjb1_pvtime', # What is ... one-way travel to work in minutes?
    'rfamkind', # Kind of family
    'rfamref', # Person number of the family reference person
    'rhnumper', # Number of persons in household this month
    'rfrelu18', # Number of persons in family under 18 years
    'rhnum65over', # Number of persons in household 65 years and over this month
    'rhnum65ovrt2', # Number of persons in household 65 years and over this month (with Type 2 persons)
    'rfpersons', # Number of persons in family
    'rpar1sex', # Parent 1 sex at interview month
    'rpar2sex', # Parent 2 sex at interview month
    'epnspouse', # Person number of spouse
    'epar1typ', # Type of relationship to parent 1
    'epar2typ', # Type of relationship to parent 2
    'epnpar1', # Person number of parent 1
    'epnpar2', # Person number of parent 2
]
# Maintenance helper, disabled by default. Switch the guard to True to print
# the `use_cols` entries again, each followed by its schema label as a comment,
# ready to be pasted back into the list above.
if False:
    for col in use_cols:
        label = col_label_map[col.upper()]
        print(f"    '{col.lower()}', # {label}")


In [10]:
%%time
# The file header uses upper-case variable names (that is what `usecols`
# selects on), so the dtype mapping must be keyed the same way. It was
# previously keyed by the lower-case names, which match no column, so the
# schema dtypes were never applied.
csv_cols = [c.upper() for c in use_cols]

df_data = sipp.reference('pu2018_csv').read_csv(
    sep='|',
    usecols=csv_cols,
    dtype={c: dtype_map[c] for c in csv_cols})

# The rest of the notebook refers to variables by their lower-case names.
df_data.columns = [c.lower() for c in df_data.columns]


CPU times: user 57.9 s, sys: 1min 21s, total: 2min 19s
Wall time: 3min 3s


In [11]:

# Optional step, disabled by default: attach the calendar-year replicate
# weights to the primary data. Flip the guard to True to enable it.
if False:
    # NOTE(review): the replicate-weight schema ('rw2018_schema') is read from
    # `sipp`, so the CSV is assumed to be a reference of `sipp` as well (this
    # previously used `pkg`) -- confirm before enabling.
    df_rw = sipp.reference('rw2018_csv').read_csv(sep='|')
    # Lower-case the headers so they line up with df_data's column names.
    df_rw.columns = [c.lower() for c in df_rw.columns]

    #check these estimates against the validation xls file to help ensure that the data
    #    were read in correctly. Note that the validation xls files do not include all variables
    print('REPWGT100 mean:' + str(df_rw['repwgt100'].mean()))

    #Merge data and replicate weights on SSUID, PNUM, MONTHCODE. Columns present
    #    in both frames get _x/_y suffixes; keep the primary file's copies of
    #    swave and spanel under their plain names.
    df = df_data.merge(df_rw, on=['ssuid','pnum','monthcode'])\
                .rename(columns={'swave_x':'swave', 'spanel_x':'spanel'})

else:
    # No merge: `df` is the same object as `df_data`, not a copy.
    df = df_data
    


def raceeth(r):
    """Collapse a person's origin and race codes into one category label.

    Args:
        r: Row-like object exposing ``eorigin`` and ``erace`` attributes.

    Returns:
        'hisp' when ``eorigin`` equals 1, regardless of race; otherwise
        'white', 'black' or 'asian' for ``erace`` 1, 2 or 3; 'other' for
        anything else.
    """
    # The origin check comes first, so it takes precedence over the race code.
    if r.eorigin == 1:
        return 'hisp'

    for code, label in ((1, 'white'), (2, 'black'), (3, 'asian')):
        if r.erace == code:
            return label

    return 'other'

# raceeth() only reads `eorigin` and `erace`, so hand apply() just those two
# columns: each per-row Series is then built from 2 values instead of every
# loaded variable. The resulting labels are the same.
df['raceeth'] = df[['eorigin', 'erace']].apply(raceeth, axis=1)
#df['dummy'] = 1 # For counting

In [22]:
# Copy the SIPP schema labels into the column descriptions of the 'sipp_18'
# resource. The guard used to be `if not c.value:`, which ran the body only
# for columns WITHOUT a name and then called .upper() on that empty/None name.
for c in pkg.resource('sipp_18').schema_term.find('Table.Column'):
    if not c.value:
        continue  # nameless column term: nothing to look up

    label = col_label_map.get(c.value.upper())
    if label:
        # Only assign when a label exists, so columns that are absent from the
        # SIPP schema keep whatever description they already have.
        c.description = label

# Persist the edited metadata, then reopen the package so the displayed
# resource reflects what was written.
pkg.write()
pkg = mp.jupyter.open_source_package()
pkg.resource('sipp_18')

Header,Type,Description
ssuid,integer,"Sample unit identifier. This identifier is created by scrambling together PSU, Sequence #1, Sequence #2, and the Frame Indicator for a case. It may be used in matching sample units from different waves."
spanel,integer,Panel year
swave,integer,Wave number of interview
efood1,integer,The food you bought did not last?
efood6,integer,"In 2017, were you ever hungry but didn't eat because there wasn't enough money for food?"
eawbsafe,integer,Is ... neighborhood safe from crime?
pnum,integer,Person number
ems,integer,"Is ... currently married, widowed, divorced, separated, or never married?"
erelrpe,integer,Household relationship (detailed categories)
esex,integer,Sex of this person
