# CAPSTONE TWO: NEGATIVE INCOME TAX EXPERIMENTS

## IMPORTS

In [6]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [7]:
# Load the raw extract; dtype=str keeps every column as text, so all later
# comparisons in this notebook are made against string literals.
gary_df = pd.read_csv('raw_gary_df.csv', dtype=str)

## DATA CLEANING

In [None]:
# Drop every row that has at least one missing value (mutates gary_df in place).
gary_df.dropna(inplace=True)

### WIDE DATA -> LONG DATA

In [None]:
# A column is treated as "wide" when its name contains '-'; everything else is an id column.
wide_cols = [name for name in gary_df.columns if '-' in name]
id_cols = gary_df.drop(wide_cols, axis=1).columns

# Empty accumulators, one per family of wide columns; the splitting loop fills them.
stubs48_split, stubs42_split, stubs9_split, stubs16_split, stubs5_split = [], [], [], [], []

# Column names grouped by the marker that appears in them.
cols_48 = [name for name in gary_df.columns if '-48' in name]
cols_42 = [name for name in gary_df.columns if '-42' in name]
cols_9 = [name for name in gary_df.columns if '-9' in name]
cols_16 = [name for name in gary_df.columns if '-16' in name]
cols_5 = [name for name in gary_df.columns if '-5' in name]

In [None]:
def chunkstring(string, length):
    """Cut ``string`` into consecutive pieces of ``length`` characters.

    The last piece is shorter when ``len(string)`` is not a multiple of
    ``length``; an empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces
# Each entry describes one family of wide columns:
#   (marker looked for in the column name,
#    number of periods packed into each string value,
#    number of trailing characters cut from the column name to build the new-column prefix,
#    list that collects the part of the column name before the '-').
# NOTE(review): the '-73' family is split into 5 periods and feeds stubs5_split;
# presumably these are year-range columns - confirm against the codebook.
# NOTE(review): the '-16' family cuts only 3 characters while '-48'/'-42' cut 4; for a
# name ending in '1-16' that leaves the leading '1' in the prefix - confirm this is intended.
_WIDE_LAYOUTS = [
    ('-48', 48, 4, stubs48_split),
    ('-42', 42, 4, stubs42_split),
    ('-9', 9, 3, stubs9_split),
    ('-16', 16, 3, stubs16_split),
    ('-73', 5, 3, stubs5_split),
]


def _explode_wide_column(df, column, n_periods, suffix_len):
    """Split one packed string column of ``df`` into ``n_periods`` new columns, in place.

    Every value in ``df[column]`` must have the same length. The column is first
    replaced by a list of equally sized chunks, then chunk ``k`` (0-based) is written
    to a new column named ``column[:-suffix_len] + str(k + 1)``.

    Raises:
        ValueError: if the values in ``column`` do not all share one length.
    """
    widths = df[column].str.len().unique()
    # int() on a 1-element array is deprecated in recent NumPy and fails opaquely when
    # lengths differ, so check for a single width and index it explicitly.
    if len(widths) != 1:
        raise ValueError(
            f"column {column!r} must hold strings of one uniform length, got lengths {list(widths)}"
        )
    chunk_len = int(widths[0] / n_periods)
    df[column] = df[column].apply(chunkstring, args=[chunk_len])
    prefix = column[:-suffix_len]
    for period in range(n_periods):
        # Bind the index as a default argument so the lambda does not depend on the loop variable.
        df[prefix + str(period + 1)] = df[column].apply(lambda chunks, k=period: chunks[k])


for column in wide_cols:
    # The markers are checked independently and in a fixed order, as separate `if`s would be.
    for marker, n_periods, suffix_len, stub_bucket in _WIDE_LAYOUTS:
        if marker in column:
            stub_bucket.append(column.split('-')[0])
            _explode_wide_column(gary_df, column, n_periods, suffix_len)

In [None]:
# Remove the original packed columns now that their per-period columns exist.
gary_df.drop(wide_cols,axis=1, inplace=True)

In [None]:
# Cut the last character off every collected name to obtain the stub shared by
# all of that family's per-period columns.
stubs48, stubs42, stubs9, stubs16, stubs5 = (
    [name[:-1] for name in split_names]
    for split_names in (stubs48_split, stubs42_split, stubs9_split, stubs16_split, stubs5_split)
)
stubs = [stubs48, stubs42, stubs9, stubs16, stubs5]
id_list = list(id_cols)


In [None]:
def stubs_sorter(df, stubs_list):
    """Reshape the per-period columns that belong to ``stubs_list`` from wide to long.

    For every stub, the columns of ``df`` whose name contains the stub and is one or
    two characters longer than it are selected. These columns plus ``'PERNUM'`` are
    passed to ``pd.wide_to_long`` with ``i='PERNUM'`` and ``j='period'``.

    Args:
        df: frame holding a ``'PERNUM'`` column and the per-period columns.
        stubs_list: list of stub names (column name without the period suffix).

    Returns:
        The long-format frame produced by ``pd.wide_to_long``.
    """
    cols_list = []
    for stub in stubs_list:
        lengths = {len(stub) + 1, len(stub) + 2}
        cols_list += [col for col in df.columns if stub in col and len(col) in lengths]
    if any('48' in col for col in cols_list):
        # Columns excluded by hand whenever a '48' column was selected.
        # NOTE(review): presumably these are picked up by substring overlap with a stub - confirm.
        special_case = ['EMPSTAT1', 'TYPWRKR7', 'TYPWRKR3', 'TYPWRKR8', 'EMPSTAT7', 'EMPSTAT9',
           'TYPWRKR1', 'EMPSTAT8', 'TYPWRKR6', 'TYPWRKR9', 'EMPSTAT2', 'EMPSTAT4',
           'TYPWRKR2', 'EMPSTAT3', 'TYPWRKR5', 'EMPSTAT6', 'EMPSTAT5', 'TYPWRKR4']
        for col in special_case:
            # Skip names that were not selected instead of letting list.remove raise.
            if col in cols_list:
                cols_list.remove(col)
    cols_list.append('PERNUM')
    # Use the frame that was passed in, not the module-level gary_df.
    df_long = pd.wide_to_long(df[cols_list], stubnames=stubs_list, i='PERNUM', j="period")
    return df_long


In [None]:
# Build one long-format frame per family of stubs.
long48 = stubs_sorter(gary_df, stubs48)
long42 = stubs_sorter(gary_df, stubs42)
long16 = stubs_sorter(gary_df, stubs16)
long9 = stubs_sorter(gary_df, stubs9)
long5 = stubs_sorter(gary_df, stubs5)
# Note: the order here (48, 42, 16, 9, 5) differs from the order of `stubs`.
longs = [long48, long42, long16, long9, long5]
# Bare last expression: displays long5 as the cell output.
long5

In [None]:
# Move the (PERNUM, period) index back into ordinary columns and remove
# fully duplicated rows; both operations mutate each frame in place.
for df in longs:
    df.reset_index(inplace=True)
    df.drop_duplicates(inplace=True)

In [None]:
# Lookup tables that translate each family's own period number into a common
# period scale. NOTE(review): presumably the targets are months 1-48 of the
# experiment - confirm against the codebook.
map9 = {1 : 1, 2 : 14, 3 : 18, 4 : 22, 5 : 26, 6 : 31, 7 : 35, 8 : 38, 9 : 43}
# Keys 91-95 are the numeric suffixes left on the 5-period columns after the stub is removed.
map5 = {91 : 1, 92 : 13, 93 : 25, 94 : 37, 95 : 48}
# The 16-period family is rescaled by a constant factor instead of a lookup.
long16['period'] = long16['period']*4
# Series.map leaves NaN for any period value that is not a key of the mapping.
long9['period'] = long9['period'].map(map9)
long5['period'] = long5['period'].map(map5)


In [None]:
# Display only: sort_values returns a new frame, long9 itself is left unsorted.
long9.sort_values(by = ['PERNUM', 'period'])

In [None]:
# Display only: sort_values returns a new frame, long5 itself is left unsorted.
long5.sort_values(by = ['PERNUM', 'period'])

In [None]:
# Default (inner) merge: only (PERNUM, period) pairs present in both long5 and
# long9 survive. long48, long42 and long16 are not merged here.
gary_final = pd.merge(long5, long9, on=['PERNUM', 'period'])
# Displays up to the first 500 rows of the merged frame.
gary_final.head(500)

In [None]:
# Spot check: show the merged rows of one person (PERNUM is a string because of dtype=str).
gary_final.loc[gary_final['PERNUM'] =='500001']

### HANDLING NULLS

Dropping families with no people and people with no families. (This was due to a record-keeping error on the part of the experimenters: families whose number starts with 4 are supposed to be in the Sacramento file, not the Gary file.)

Converting some of the more-common missing data codes

In [None]:
# Turn the listed missing-data codes into real NaN values across the whole frame.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
gary_df.replace(['9997', '9999','9993','9994', '97', '93'], np.nan, inplace=True)

Dropping the columns in which more than 75% of the entries are blank.

In [None]:
def percent_miss(df):
    """Return, for each column of ``df``, the share of entries that are null.

    The result is a Series indexed by column name whose values are fractions
    between 0 and 1 (not percentages).
    """
    null_flags = df.isnull()
    return null_flags.mean()
# Columns whose share of missing entries exceeds 75% are removed from gary_df in place.
too_sparse = percent_miss(gary_df) > 0.75
bad_cols = gary_df.loc[:, too_sparse].columns
gary_df.drop(columns=bad_cols, inplace=True)

ATTDATE stands for attrition date: the date on which a family left the experiment before it ended. These families left because they moved away, stopped responding to the experimenters, or the active filing member passed away. They are dropped here because we are interested in the effects of welfare over time, and their records are cut off early.

In [None]:
# Keep only the rows whose ATTDATE is '00000', then discard ATTDATE and FAMNUM.
# The drop is chained so nothing is mutated in place on a filtered slice, which
# can raise pandas' SettingWithCopyWarning.
gary_df = (
    gary_df
    .loc[gary_df['ATTDATE'] == '00000', :]
    .drop(columns=['ATTDATE', 'FAMNUM'])
)

## Encoding

Many of the comments within this section are ideas for further analysis, or methods of data cleaning attempted that either failed or were too large of a time sink to complete.

In [None]:
#gary_df = gary_df.loc[gary_df['TREATLEV'] != '0']
#gary_control_df = gary_df.loc[gary_df['TREATLEV'] == '0']

In [None]:
# Index by person; the guard keeps the cell re-runnable, since PERNUM stops being
# a column after the first run.
if gary_df.index.name != 'PERNUM':
    gary_df.set_index('PERNUM', inplace=True)
# One-hot encode TREATLEV, dropping the first level as the baseline. The frame is
# read with dtype=str, so the dummy labels are strings; `prefix` names them
# 'TREATLEV_<level>' directly, which a rename keyed on integers cannot do.
gary_simp_df = pd.get_dummies(gary_df['TREATLEV'], prefix='TREATLEV', drop_first=True)
#gary_df.drop(['TREATLEV'], axis=1, inplace=True)

In [None]:
# One-hot encode POVLEV (first level dropped as the baseline) and append it to the
# feature frame. The frame is read with dtype=str, so the dummy labels are strings;
# `prefix` names them 'POV_LEV_<level>' directly and keeps them from sharing a
# label with any other dummy column already in gary_simp_df.
povlev_dummies = pd.get_dummies(gary_df['POVLEV'], prefix='POV_LEV', drop_first=True)
gary_simp_df = pd.concat([gary_simp_df, povlev_dummies], axis=1)
#gary_df.drop(['POVLEV'], axis=1, inplace=True)

This section of comments was my attempt to take the time-data and parse it out to hopefully generate new rows of data from them. Given more time, I would greatly expand this section, as it has the most potential and would give me the tools to make good features.

In [None]:
# Disabled code kept as a bare string literal: nothing below is executed, the
# cell only evaluates to (and displays) the string itself.
'''def chunkstring(string, length):
    return [string[0+i:length+i] for i in range(0, len(string), length)]
for column in gary_df.columns:
    if '-48' in column:
        leng = int(gary_df[column].str.len().unique() /48)
        gary_df[column] = gary_df[column].apply(chunkstring, args=[leng])
    if '-43' in column:
        leng = int(gary_df[column].str.len().unique() /43)
        gary_df[column] = gary_df[column].apply(chunkstring, args=[leng])
    if '-42' in column:
        leng = int(gary_df[column].str.len().unique() /42)
        gary_df[column] = gary_df[column].apply(chunkstring, args=[leng])
    if '-9' in column:
        leng = int(gary_df[column].str.len().unique() /9)
        gary_df[column] = gary_df[column].apply(chunkstring, args=[leng])
 '''       

In [None]:
# Disabled code kept as a bare string literal: nothing below is executed, the
# cell only evaluates to (and displays) the string itself.
'''
for column in gary_df.columns:
    if '-48' in column:
        leng = int(gary_df[column].str.len().unique() /48)
        basename = column
        for i in range(48):
            gary_df[basename+'-Month'+str(i)] = gary_df[column][i:i+leng]
gary_df.head()
'''

Renaming the column for the sake of ease in coding.

In [None]:
#gary_df.set_index('PERNUM', inplace=True)
#periodic_columns =['SSI1-48', 'TTI1-48', 'SS1-48', 'VA1-48', 'MISINC1-48',
#                   'OTHINC1-48', 'JOBINC1-48', 'DAYINC1-48', 'OJINC1-48',
#                  'UEMBEN1-48', 'STRKWC1-48']
# rename() silently ignores labels that are not present.
# NOTE(review): 'EMPSTAT1-9' contains '-', so the wide-to-long section appears to
# drop it together with the other wide columns before this cell runs - confirm
# the column still exists here.
gary_df.rename(columns= {'EMPSTAT1-9': 'EMPSTAT'}, inplace = True)
# Call info() so the summary is printed; the bare attribute only shows the bound method.
gary_df.info()

Ruling out persons for which employment status data was never collected. 

In [None]:
# Copy EMPSTAT only for rows whose value contains '00', '01' or '02'; the assignment
# aligns on the index, so every other row of gary_simp_df gets NaN.
# NOTE(review): 'EMPSTAT' only exists on gary_df if the rename from 'EMPSTAT1-9'
# took effect - confirm that column is still present at that point.
gary_simp_df['EMPSTAT'] = gary_df.loc[gary_df.EMPSTAT.str.contains('00|01|02', regex=True),'EMPSTAT']

In [None]:
# Remove every row that has a NaN in any column (mutates gary_simp_df in place).
gary_simp_df.dropna(axis=0, inplace=True)

Unemployed + Actively seeking work, Employed -> in the labor force -> 1   
Unemployed + not actively seeking work -> not in labor force -> 0

In [None]:
# Any record whose EMPSTAT string contains code '00' or '01' is recoded to '1'.
# A single alternation without capture groups gives the same rows as the two
# separate '(01)' / '(00)' passes, and avoids pandas' "pattern has match groups" warning.
# NOTE(review): this is a substring test, so a match may straddle two adjacent
# codes if EMPSTAT packs several codes into one string - confirm the data layout.
in_labor_force = gary_simp_df.EMPSTAT.str.contains('00|01', regex=True)
gary_simp_df.loc[in_labor_force, 'EMPSTAT'] = '1'
#gary_simp_df.loc[(gary_simp_df.EMPSTAT.str.contains('(00)(02)', regex=True)),'EMPSTAT']= 0


In [None]:
# Every record that was not recoded to '1' becomes '0'; the target stays a string column.
gary_simp_df.loc[gary_simp_df['EMPSTAT'] != '1', 'EMPSTAT'] ='0'

In [None]:
# Bare last expression: displays the encoded feature frame as the cell output.
gary_simp_df

## MODELLING

In [None]:
# NOTE(review): train_test_split (and the other scikit-learn names used below) are
# not imported in the notebook's visible imports cell - confirm they are imported
# before this cell runs.
# Features are every column except the target; the target is the '0'/'1' EMPSTAT label.
X = gary_simp_df.drop('EMPSTAT', axis=1)
Y = gary_simp_df['EMPSTAT']
# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2, stratify=Y)
# Report the split sizes instead of printing the four objects in full.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Dummy-test

In [None]:
# Baseline: always predict the most frequent class seen in the training labels.
dummy = DummyClassifier(strategy = 'most_frequent')
dummy.fit(X_train, y_train)
# Score of the baseline on the held-out set, shown as the cell output.
dummy.score(X_test, y_test)

In [None]:
# Per-class report for the baseline. The closing parenthesis of the call was
# missing, which made the cell a SyntaxError.
dummy_report = classification_report(
    y_test,
    dummy.predict(X_test),
    target_names=['Not in Labor', 'In Labor'],
)
print(dummy_report)

In [None]:
# Grid-search an L2-penalised logistic regression over C and max_iter.
lr = LogisticRegression()
grid = GridSearchCV(
    estimator=lr,
    param_grid={
        'C': np.arange(0.05, 1.0, .05),
        'penalty': ['l2'],
        'max_iter': np.arange(500, 5000, 500),
    },
    verbose=2,
)
grid.fit(X_train, y_train)

In [None]:
# Evaluate the tuned logistic regression on the held-out set; predict once and
# reuse the predictions for both the report and the confusion matrix.
lr_test_pred = grid.predict(X_test)
print(classification_report(y_test, lr_test_pred, target_names=['Not in Labor', 'In Labor']))
print(confusion_matrix(y_test, lr_test_pred))

**IMPORTANT NOTE:** Logistic Regression does *no* better than guessing the most frequent.

## RANDOM FOREST

In [None]:
# Grid-search a random forest over depth, split criterion and min_samples_split.
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore the reported scores, are reproducible across runs.
rfc = ensemble.RandomForestClassifier(random_state=2)
grid = GridSearchCV(
    estimator=rfc,
    param_grid={
        'max_depth': [1, 2, 3],
        'criterion': ['gini', 'entropy'],
        # Float values are read by scikit-learn as a fraction of the samples.
        'min_samples_split': np.arange(0.05, 1.0, 0.05),
    },
    verbose=2,
)
grid.fit(X_train, y_train)

In [None]:
# Per-class report for the estimator currently bound to `grid` (the random forest
# search when the cells are run in order).
print(classification_report(y_test, grid.predict(X_test), target_names=['Not in Labor', 'In Labor']))

**IMPORTANT NOTE:** Random Forest does *no* better than guessing the most frequent class.

In [None]:
# Randomised search over a gradient-boosting classifier. Both the estimator and
# the search draw random numbers, so both get a fixed seed (the same one used for
# the train/test split) to make the sampled candidates and the scores reproducible.
clf = ensemble.GradientBoostingClassifier(random_state=2)
rand = RandomizedSearchCV(
    estimator=clf,
    param_distributions={
        'n_estimators': np.arange(500, 1000, 250),
        'max_depth': np.arange(1, 3),
        'learning_rate': np.arange(0.1, .90, 0.1),
    },
    random_state=2,
)
rand.fit(X_train, y_train)
print(classification_report(y_test, rand.predict(X_test), target_names=['Not in Labor', 'In Labor']))

Gradient Boosting does tremendously better at predicting not in the labor force, and thus is the best model of the bunch. 

# FURTHER CONSIDERATIONS

In addition to the comments throughout this notebook, better analysis would come from:  
1. Parsing out the 1-48, 1-43, 1-42, 1-9 columns and making them into rows by adding a column for month. The index could then be Person, Month for the data frame. 
2. Using ffill to patch up a lot of the NA's that are either dropped or ignored in this notebook.
3. Running the models with all the features (but not using GridSearch/RandomizedSearch), then doing some basic feature reduction (PCA, etc.)
4. Reconsidering the structure of the categorical data TREATLEV, POVLEV.

In [None]:
#gary_df['NOTINFR'] = gary_df['EMPINT'].str.contains('02')
#gary_df['NOTINFR'] = gary_df['NOTINFR'].astype(int)
#gary_df['EMPGAIN'] = gary_df['EMPINT'].str.contains('(00)(01)', regex=True)

In [None]:
#gary_df['EMPLOSS'] = gary_df['EMPINT'].str.contains('(01)(00)', regex=True)

In [None]:
#gary_df['EMPGAIN'] = gary_df.EMPGAIN.astype(int)

In [None]:
#gary_df['EMPLOSS'] = gary_df.EMPLOSS.astype(int)

In [None]:
#columns_for_later = ['EMPLOSS', 'EMPGAIN']
#gary_df.drop(columns_for_later, axis=1, inplace=True)

In [None]:
#gary_df['EMP'] = gary_df['EMPINT'].str.contains('01')
#gary_df['EMP'] = gary_df.EMP.astype(int)