In [8]:
##
# File: sipp.ipynb
# Purpose: Data analysis for the 2020 Census SIPP (Survey of Income and Program Participation).
#	Can read the entire dataset csv (4GB) and extract relevant columns. Relevant columns (those relating to retirement
#	accounts and identifiers) are saved to a smaller csv for analysis.
#	Also runs calculations on relevant files and saves them for use in viz.

import pandas as pd
import numpy as np

# Snippets from census.gov data usage guide
rd_schema = pd.read_json('data/pu2021_schema.json')

rd_schema['dtype'] = ['Int64' if x == 'integer' \
			else 'object' if x == 'string' \
			else 'Float64' if x == 'float' \
			else 'ERROR' \
			for x in rd_schema['dtype']]

In [9]:
# USE THIS CELL TO EXTRACT COLUMNS FROM WHOLE DATASET -- DONT USE FOR CALCULATIONS
# I (Amy) am not committing the full dataset to Git. If you need more columns, let me know and I will extract them.

# Snippet from census.gov
df_data = pd.read_csv("data/pu2021.csv",\
		names=rd_schema['name'],\
		#dtype expects a dictionary of key:values
		dtype = dict([(i,v) for i,v in zip(rd_schema['name'], rd_schema['dtype'])]),\
		#files are pipe-delimited
		sep='|',\
		header=0,\
		#Add variables for analysis here. If you receive an out-of-memory error,
		#	either select less columns, or consider using the Dask module
		usecols = [
		#Common record-identification variables
		'SSUID','PNUM','MONTHCODE','ERESIDENCEID','ERELRPE','SPANEL','SWAVE',\
		#Common demographics variables, including age at time of interview (TAGE)
		#	and monthly age during the reference period (TAGE_EHC)
		'ESEX','TAGE','TAGE_EHC','ERACE','EORIGIN','EEDUC',\
		#Additional variables for analysis
		'EOWN_IRAKEO', 'EOWN_THR401','EOWN_SAV','EOWN_CD','EOWN_ST'
			]
		)
#preview the data		
print(df_data.head())
df_data.to_csv('data/sipp_data.csv') # Smaller csv that we can work with more easily

            SSUID  SPANEL  SWAVE  PNUM  ERELRPE  ESEX  EORIGIN  ERACE  EEDUC  \
0  00011428507021    2021      1   101        2     1        2      2     42   
1  00011428507021    2021      1   101        2     1        2      2     42   
2  00011428507021    2021      1   101        2     1        2      2     42   
3  00011428507021    2021      1   101        2     1        2      2     42   
4  00011428507021    2021      1   101        2     1        2      2     42   

   EOWN_SAV  EOWN_CD  EOWN_ST  EOWN_IRAKEO  EOWN_THR401  MONTHCODE  \
0         1        2        2            2            2          1   
1         1        2        2            2            2          2   
2         1        2        2            2            2          3   
3         1        2        2            2            2          4   
4         1        2        2            2            2          5   

  ERESIDENCEID  TAGE  TAGE_EHC  
0       100001    32        30  
1       100001    32        30  

In [34]:
# USE THIS CELL WHEN CALCULATING
data = pd.read_csv('data/sipp_data.csv')
unique_data = data.drop_duplicates(subset=["SSUID"])

#print(unique_data.head())

num_IRA = data[data['EOWN_IRAKEO'] == 1].count()['EOWN_IRAKEO']
num_401 = data[data['EOWN_THR401'] == 1].count()['EOWN_THR401']
num_SAV = data[data['EOWN_SAV'] == 1].count()['EOWN_SAV']
num_CD = data[data['EOWN_CD'] == 1].count()['EOWN_CD']
print(num_IRA, num_401, num_SAV, num_CD)
#print(num_IRA)

126156 164928 353364 28404


In [58]:
ages = unique_data.groupby('TAGE')['TAGE']

accounts_by_age = {}
for (i, _) in ages:
    #print(i)
    if not i in accounts_by_age:
        accounts_by_age[i] = {}
    aged_data = data[data['TAGE'] == i]
    accounts_by_age[i]['num_SAV'] = int(aged_data[aged_data['EOWN_SAV'] == 1].count()['EOWN_SAV'])
    accounts_by_age[i]['num_401'] = int(aged_data[aged_data['EOWN_THR401'] == 1].count()['EOWN_THR401'])
    accounts_by_age[i]['num_CD'] = int(aged_data[aged_data['EOWN_CD'] == 1].count()['EOWN_CD'])
    accounts_by_age[i]['num_IRA'] = int(aged_data[aged_data['EOWN_IRAKEO'] == 1].count()['EOWN_IRAKEO'])

In [63]:
# Save to JSON for viz
import json

to_write = {
    "account_nums": [
        {
            "account": "IRA",
            "num": int(num_IRA)
        },
        {
            "account": "401k",
            "num": int(num_401)
        },
        {
            "account": "Savings",
            "num": int(num_SAV)
        },
        {
            "account": "CD",
            "num": int(num_CD)
        }
    ],
    "accounts_by_age": []
}

for i in accounts_by_age:
    for x in accounts_by_age[i]:
        to_write['accounts_by_age'].append({"age": i, "account": x, "num": accounts_by_age[i][x]})

print(to_write)
    
json_object = json.dumps(to_write, indent=4)
with open("sipp.json", "w") as outfile:
    outfile.write(json_object)

{'account_nums': [{'account': 'IRA', 'num': 126156}, {'account': '401k', 'num': 164928}, {'account': 'Savings', 'num': 353364}, {'account': 'CD', 'num': 28404}], 'accounts_by_age': [{'age': 15, 'account': 'num_SAV', 'num': 1176}, {'age': 15, 'account': 'num_401', 'num': 24}, {'age': 15, 'account': 'num_CD', 'num': 48}, {'age': 15, 'account': 'num_IRA', 'num': 60}, {'age': 16, 'account': 'num_SAV', 'num': 2316}, {'age': 16, 'account': 'num_401', 'num': 0}, {'age': 16, 'account': 'num_CD', 'num': 72}, {'age': 16, 'account': 'num_IRA', 'num': 96}, {'age': 17, 'account': 'num_SAV', 'num': 2640}, {'age': 17, 'account': 'num_401', 'num': 24}, {'age': 17, 'account': 'num_CD', 'num': 60}, {'age': 17, 'account': 'num_IRA', 'num': 108}, {'age': 18, 'account': 'num_SAV', 'num': 2556}, {'age': 18, 'account': 'num_401', 'num': 0}, {'age': 18, 'account': 'num_CD', 'num': 96}, {'age': 18, 'account': 'num_IRA', 'num': 48}, {'age': 19, 'account': 'num_SAV', 'num': 2964}, {'age': 19, 'account': 'num_401