In [7]:

import pandas as pd
import numpy as np
import metapack as mp
from pathlib import Path
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
from sdipylib.plot import  source_attribution

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
from sklearn.metrics import classification_report


# /Users/eric/opt/anaconda3/envs/data/lib/python3.7/site-packages/pandas/plotting/_tools.py:307: MatplotlibDeprecationWarning: 
# The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
#   layout[ax.rowNum, ax.colNum] = ax.get_visible()
import warnings
warnings.simplefilter("ignore")

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

%run weights.py
%run lib.py 

source = "Survey of Income and Program Participation, 2018"

In [2]:
sipp = mp.multi_open('census.gov-sipp-inequality', print_ref=True)
sipp

Opening:  index:census.gov-sipp-inequality


In [3]:
df = sipp.resource('sipp_18').dataframe()

In [4]:
sipp.resource('sipp_18')

Header,Type,Description
ssuid,integer,"Sample unit identifier. This identifier is created by scrambling together PSU, Sequence #1, Sequence #2, and the Frame Indicator for a case. It may be used in matching sample units from different waves."
spanel,integer,Panel year
swave,integer,Wave number of interview
efood1,integer,The food you bought did not last?
efood6,integer,"In 2017, were you ever hungry but didn't eat because there wasn't enough money for food?"
eawbsafe,integer,Is ... neighborhood safe from crime?
pnum,integer,Person number
ems,integer,"Is ... currently married, widowed, divorced, separated, or never married?"
erelrpe,integer,Household relationship (detailed categories)
esex,integer,Sex of this person


In [8]:
# ERACE
#    1. White alone
#    2. Black alone
#    3. Asian alone
#    4. Residual 

# New wmean that uses the weight variable name for SIPP
def wmean_(df, column_name):
    """Calculate the weighted mean of a list."""

    return wmean(df, column_name, 'wpfinwgt')

bc = pd.CategoricalDtype(ordered=True)

t = df.groupby('ssuid').first()

t['erace'] = t.erace.astype('category').cat.rename_categories({1:'white',2:'black',3:'asian',4:'other'})
t['rfamkind'] = t.rfamkind.astype(bc).cat.rename_categories({1:'married',2:'f_head',3:'m_head'})

t['in_poverty'] = (t.tfcyincpov<1).astype(int)
t['is_married'] = (t.rfamkind == 'married').astype(int)

dfp = t
dfp.head()


Unnamed: 0_level_0,spanel,swave,efood1,efood6,eawbsafe,pnum,ems,erelrpe,esex,eorigin,...,thval_ast,thdebt_ast,thnetworth,tptrninc,tptotinc,tfcyincpov,thcyincpov,raceeth,in_poverty,is_married
ssuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11413607018,2018,1,3,,2,101,6,2,1,1,...,0,0,0,0,0,0.0,0.0,1738.0,1,0
11413613418,2018,1,3,,2,101,4,2,2,2,...,30330,0,100000,102730,72400,30330.0,0.0,2972.0,0,0
11413646518,2018,1,1,1.0,2,101,4,7,2,1,...,0,0,125000,125000,0,125000.0,0.0,3333.0,0,1
11428574618,2018,1,3,,2,101,1,1,2,2,...,59125,0,75000,156250,38000,118250.0,0.0,2370.0,0,1
11428577018,2018,1,3,,1,101,1,1,1,2,...,2983246,0,150000,3797801,0,3797801.0,0.0,4413.0,0,1


In [9]:
wmean_(t, 'in_poverty')

0.16368062365911898

In [10]:
t.groupby('erace').apply(wmean_,'in_poverty')

erace
white    0.136650
black    0.328201
asian    0.096939
other    0.241585
dtype: float64

In [11]:
t.groupby(['erace','rfamkind']).apply(wmean_,'in_poverty').unstack()

rfamkind,0,married,f_head,m_head
erace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
white,0.173966,0.084179,0.253101,0.144807
black,0.370202,0.193114,0.40967,0.264904
asian,0.13157,0.085288,0.063001,0.039267
other,0.281409,0.127859,0.368614,0.337515


In [12]:
t.groupby(['erace','is_married']).apply(wmean_,'in_poverty').unstack()

is_married,0,1
erace,Unnamed: 1_level_1,Unnamed: 2_level_1
white,0.188002,0.084179
black,0.37785,0.193114
asian,0.115492,0.085288
other,0.310389,0.127859


In [13]:
t.groupby(['rfamkind', 'rfpersons', 'erace']).apply(wmean_,'in_poverty').unstack()

Unnamed: 0_level_0,erace,white,black,asian,other
rfamkind,rfpersons,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0.173966,0.370202,0.13157,0.281409
married,2,0.059713,0.180891,0.081831,0.091067
married,3,0.09792,0.184027,0.061536,0.181922
married,4,0.107689,0.199805,0.110302,0.099043
married,5,0.123529,0.200991,0.086157,0.152521
married,6,0.120382,0.122523,0.135568,0.237702
married,7,0.179143,0.471075,0.0,0.094389
married,8,0.120901,0.30915,0.0,0.0
married,9,0.134064,0.505035,0.343093,0.0
married,10,0.0,0.0,,


In [14]:
x = t.groupby(['rfrelu18', 'rfamkind',  'erace']).apply(wmean_,'in_poverty').unstack()
x = x.loc[[0,1,2,3]] # Only enough records for less than 4 children
x

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0.213899,0.356094,0.13641,0.338398
0,married,0.061452,0.171249,0.074179,0.098181
0,f_head,0.152136,0.263876,0.037092,0.298871
0,m_head,0.150236,0.197571,0.056497,0.395458
1,0,0.280797,0.460107,0.328439,0.198425
1,married,0.120105,0.175777,0.081777,0.167956
1,f_head,0.304483,0.447595,0.113678,0.376
1,m_head,0.137985,0.346767,0.0,0.256478
2,0,0.171473,0.297667,0.0,0.139873
2,married,0.109186,0.259764,0.100545,0.141242


In [15]:
x.loc[([0,1,2,3],'married'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,married,0.061452,0.171249,0.074179,0.098181
1,married,0.120105,0.175777,0.081777,0.167956
2,married,0.109186,0.259764,0.100545,0.141242
3,married,0.135896,0.15516,0.09536,0.217445


In [16]:
x.loc[([0,1,2,3],'f_head'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,f_head,0.152136,0.263876,0.037092,0.298871
1,f_head,0.304483,0.447595,0.113678,0.376
2,f_head,0.31292,0.470525,0.0,0.406524
3,f_head,0.306419,0.465184,0.186282,0.463391


In [17]:
x.loc[([0,1,2,3],'m_head'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,m_head,0.150236,0.197571,0.056497,0.395458
1,m_head,0.137985,0.346767,0.0,0.256478
2,m_head,0.139592,0.324093,0.0,0.451136
3,m_head,0.180648,0.613685,,0.0


# Regression


In [18]:
dfp['rfrelu18'] = dfp['rfrelu18'].astype('category')
dfp['rfrelu18'].value_counts()

0    9046
1    3412
2    2664
3    1092
4     339
5      99
6      37
7      15
8       4
9       2
Name: rfrelu18, dtype: int64

In [19]:

dfs = dfp.sample(1_000_000, replace=True, weights=dfp.wpfinwgt)
dfs.shape

(1000000, 79)

In [20]:
from category_encoders import LeaveOneOutEncoder, TargetEncoder, OneHotEncoder, OrdinalEncoder, BinaryEncoder

enc = OneHotEncoder()

x_cols = ['rfrelu18', 'rfamkind',  'erace']
y_col = 'in_poverty'

t = dfs[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=101)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)


print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92    276348
           1       0.63      0.01      0.02     45403

    accuracy                           0.86    321751
   macro avg       0.75      0.50      0.47    321751
weighted avg       0.83      0.86      0.80    321751



In [21]:
logmodel.score(X_test, y_test)

0.8594378883049315

Try to improve the model by including an equal number of records in each target class. 

In [22]:
t1 = dfp[dfp.in_poverty == 1].sample(500_000, replace=True, weights=dfp.wpfinwgt)
t2 = dfp[dfp.in_poverty == 0].sample(500_000, replace=True, weights=dfp.wpfinwgt)
t = pd.concat([t1, t2])

t = t[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=101)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

print(classification_report(y_test,predictions))


              precision    recall  f1-score   support

           0       0.67      0.74      0.70    165042
           1       0.65      0.56      0.60    139680

    accuracy                           0.66    304722
   macro avg       0.66      0.65      0.65    304722
weighted avg       0.66      0.66      0.66    304722



In [23]:
t = dfp[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

predictions = logmodel.predict(X)
print(classification_report(y,predictions))

              precision    recall  f1-score   support

           0       0.87      0.17      0.28     14376
           1       0.14      0.85      0.24      2334

    accuracy                           0.26     16710
   macro avg       0.51      0.51      0.26     16710
weighted avg       0.77      0.26      0.28     16710



In [41]:
# How often to famillies of each race eat with their children?
dfp['dummy'] = 1
t = dfp.groupby(['erace','edinrpar']).dummy.count().unstack()
t = t.divide(t.sum(axis=1), axis=0)
(t*100).round(2)

edinrpar,0,1,2,3,4,5,6,7
erace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
white,1.26,0.83,3.01,4.73,6.04,9.16,4.47,70.5
black,2.99,0.75,3.93,5.05,5.51,7.76,2.34,71.68
asian,1.75,0.44,1.75,2.84,4.59,4.59,2.18,81.88
other,2.25,0.96,3.86,2.89,6.11,9.65,4.18,70.1


In [55]:
# A lot more blacks have educational debt than whites
t = dfp.copy()
t['toeddebtval'] = t.toeddebtval.fillna(0)
t['has_debt'] = t.toeddebtval > 0
t.groupby(['erace']).has_debt.mean()

erace
white    0.172993
black    0.238438
asian    0.157177
other    0.215947
Name: has_debt, dtype: float64