In [9]:

import pandas as pd
import numpy as np
import metapack as mp
from pathlib import Path
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
from sdipylib.plot import  source_attribution

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
from sklearn.metrics import classification_report


# /Users/eric/opt/anaconda3/envs/data/lib/python3.7/site-packages/pandas/plotting/_tools.py:307: MatplotlibDeprecationWarning: 
# The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
#   layout[ax.rowNum, ax.colNum] = ax.get_visible()
import warnings
warnings.simplefilter("ignore")

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

%run weights.py
%run lib.py 

source = "Survey of Income and Program Participation, 2018"

In [10]:
sipp = mp.multi_open('census.gov-sipp-inequality', print_ref=True)
sipp

Opening:  index:census.gov-sipp-inequality


In [11]:
df = sipp.resource('sipp_18').dataframe()

In [12]:
sipp.resource('sipp_18')

Header,Type,Description
ssuid,integer,"Sample unit identifier. This identifier is created by scrambling together PSU, Sequence #1, Sequence #2, and the Frame Indicator for a case. It may be used in matching sample units from different waves."
spanel,integer,Panel year
swave,integer,Wave number of interview
efood1,integer,The food you bought did not last?
efood6,integer,"In 2017, were you ever hungry but didn't eat because there wasn't enough money for food?"
eawbsafe,integer,Is ... neighborhood safe from crime?
pnum,integer,Person number
ems,integer,"Is ... currently married, widowed, divorced, separated, or never married?"
erelrpe,integer,Household relationship (detailed categories)
esex,integer,Sex of this person


In [99]:
# ERACE
#    1. White alone
#    2. Black alone
#    3. Asian alone
#    4. Residual 

# New wmean that uses the weight variable name for SIPP
def wmean_(df, column_name):
    """Calculate the weighted mean of a list."""

    return wmean(df, column_name, 'wpfinwgt')

bc = pd.CategoricalDtype(ordered=True)

t = df.groupby('ssuid').first()

t['erace'] = t.erace.astype('category').cat.rename_categories({1:'white',2:'black',3:'asian',4:'other'})
t['rfamkind'] = t.rfamkind.astype(bc).cat.rename_categories({1:'married',2:'f_head',3:'m_head'})

t['in_poverty'] = (t.tfcyincpov<1).astype(int)
t['is_married'] = (t.rfamkind == 'married').astype(int)

dfp = t


Unnamed: 0_level_0,rp_0,rp_1,rp_2,rp_3,rp_4,rp_5,rp_6,rp_7,rp_8,rp_9
ssuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11413607018,0,0,0,0,0,0,0,0,0,0
11413613418,0,0,0,0,0,0,0,0,0,0
11413646518,0,1,0,0,0,0,0,0,0,0
11428574618,1,0,0,0,0,0,0,0,0,0
11428577018,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
91092598838318,1,0,0,0,0,0,0,0,0,0
91092598850518,1,0,0,0,0,0,0,0,0,0
91092598851018,0,0,1,0,0,0,0,0,0,0
91092598856518,1,0,0,0,0,0,0,0,0,0


In [22]:
wmean_(t, 'in_poverty')

0.1393930083746234

In [17]:
t.groupby('erace').apply(wmean_,'in_poverty')

erace
white    0.120612
black    0.232951
asian    0.138576
other    0.209234
dtype: float64

In [18]:
t.groupby(['erace','rfamkind']).apply(wmean_,'in_poverty').unstack()

rfamkind,married,f_head,m_head
erace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
white,0.045885,0.239585,0.117343
black,0.072493,0.30005,0.159401
asian,0.077459,0.152558,0.222944
other,0.067944,0.268196,0.208447


In [19]:
t.groupby(['erace','is_married']).apply(wmean_,'in_poverty').unstack()

is_married,0,1
erace,Unnamed: 1_level_1,Unnamed: 2_level_1
white,0.195327,0.045885
black,0.293437,0.072493
asian,0.239469,0.077459
other,0.297934,0.067944


In [24]:
t.groupby(['rfamkind', 'rfpersons', 'erace']).apply(wmean_,'in_poverty').unstack()

Unnamed: 0_level_0,erace,white,black,asian,other
rfamkind,rfpersons,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
married,1,0.132982,0.322921,0.140457,0.249806
married,2,0.03478,0.046636,0.097754,0.050941
married,3,0.035628,0.035485,0.064701,0.031614
married,4,0.046907,0.061548,0.050324,0.042975
married,5,0.08472,0.117417,0.096986,0.161893
married,6,0.100039,0.183647,0.060459,0.086031
married,7,0.142306,0.132237,0.091689,0.0
married,8,0.126457,0.640325,0.24102,0.0
married,9,0.338257,0.572859,0.0,0.356411
married,10,0.136929,0.0,,


In [34]:
x = t.groupby(['rfrelu18', 'rfamkind',  'erace']).apply(wmean_,'in_poverty').unstack()
x = x.loc[[0,1,2,3]] # Only enough records for less than 4 children
x

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,married,0.034253,0.04853,0.082612,0.051995
0,f_head,0.094155,0.12144,0.067435,0.163869
0,m_head,0.073191,0.129253,0.27841,0.119375
1,married,0.042173,0.042559,0.05117,0.036799
1,f_head,0.257078,0.307391,0.209442,0.239746
1,m_head,0.144811,0.189385,0.112166,0.289295
2,married,0.055448,0.073876,0.067846,0.057046
2,f_head,0.298093,0.293104,0.151929,0.295964
2,m_head,0.179817,0.155754,0.0,0.420478
3,married,0.10019,0.155847,0.13512,0.194413


In [46]:
x.loc[([0,1,2,3],'married'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,married,0.034253,0.04853,0.082612,0.051995
1,married,0.042173,0.042559,0.05117,0.036799
2,married,0.055448,0.073876,0.067846,0.057046
3,married,0.10019,0.155847,0.13512,0.194413


In [47]:
x.loc[([0,1,2,3],'f_head'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,f_head,0.094155,0.12144,0.067435,0.163869
1,f_head,0.257078,0.307391,0.209442,0.239746
2,f_head,0.298093,0.293104,0.151929,0.295964
3,f_head,0.43167,0.543238,0.361868,0.514708


In [48]:
x.loc[([0,1,2,3],'m_head'),:]

Unnamed: 0_level_0,erace,white,black,asian,other
rfrelu18,rfamkind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,m_head,0.073191,0.129253,0.27841,0.119375
1,m_head,0.144811,0.189385,0.112166,0.289295
2,m_head,0.179817,0.155754,0.0,0.420478
3,m_head,0.11813,0.529037,,0.0


# Regression


In [100]:
dfp['rfrelu18'] = dfp['rfrelu18'].astype('category')
dfp['rfrelu18'].value_counts()

0    9046
1    3412
2    2664
3    1092
4     339
5      99
6      37
7      15
8       4
9       2
Name: rfrelu18, dtype: int64

In [101]:

dfs = dfp.sample(1_000_000, replace=True, weights=dfp.wpfinwgt)
dfs.shape

(1000000, 79)

In [115]:
from category_encoders import LeaveOneOutEncoder, TargetEncoder, OneHotEncoder, OrdinalEncoder, BinaryEncoder

enc = OneHotEncoder()

x_cols = ['rfrelu18', 'rfamkind',  'erace']
y_col = 'in_poverty'

t = dfs[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=101)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)


print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    289999
           1       0.73      0.07      0.12     32060

    accuracy                           0.90    322059
   macro avg       0.82      0.53      0.54    322059
weighted avg       0.89      0.90      0.87    322059



In [116]:
logmodel.score(X_test, y_test)

0.9046416960867418

Try to improve the model by including an equal number of records in each target class. 

In [117]:
t1 = dfp[dfp.in_poverty == 1].sample(500_000, replace=True, weights=dfp.wpfinwgt)
t2 = dfp[dfp.in_poverty == 0].sample(500_000, replace=True, weights=dfp.wpfinwgt)
t = pd.concat([t1, t2])

t = t[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=101)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

print(classification_report(y_test,predictions))


              precision    recall  f1-score   support

           0       0.76      0.79      0.77    168357
           1       0.67      0.63      0.65    114434

    accuracy                           0.72    282791
   macro avg       0.71      0.71      0.71    282791
weighted avg       0.72      0.72      0.72    282791



In [120]:
t = dfp[x_cols + [y_col] ].dropna()

y = t[y_col]
X = enc.fit_transform(t[x_cols], y)

predictions = logmodel.predict(X)
print(classification_report(y,predictions))

              precision    recall  f1-score   support

           0       0.95      0.77      0.85     14974
           1       0.24      0.64      0.35      1736

    accuracy                           0.75     16710
   macro avg       0.59      0.70      0.60     16710
weighted avg       0.87      0.75      0.80     16710

