In [1]:
# package imports
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# import matplotlib.pyplot as plt 

In [2]:
# load data
aarp = pd.read_csv('Data/clean_AARP.csv')

In [3]:
# list vars
model_vars = ['AGE4_recode', 'D6_recode', 'D8_recode', 'D9C_recode', 'D18_recode', 'D26_recode', 'MARITAL_recode', 'EDUC4_recode', 'RACETHNICITY_recode', 'GENDER_recode', 'HHSIZE', 'INCOME_recode']

# aarp be aarp
aarp = aarp[model_vars]

aarp['RACETHNICITY_recode'] = pd.Categorical(aarp['RACETHNICITY_recode'])
aarp['GENDER_recode'] = pd.Categorical(aarp['GENDER_recode'])
aarp['MARITAL_recode'] = pd.Categorical(aarp['MARITAL_recode'])
aarp['D26_recode'] = pd.Categorical(aarp['D26_recode'])
aarp['D18_recode'] = pd.Categorical(aarp['D18_recode'])

# recode D9C to binary
# if you never feel SI then 1, else 2, 
# then 0, 1 binary
aarp['is_si'] = aarp['D9C_recode'].apply(lambda x: 1 if x == 1 else 1 if x == 2 else 3)
aarp['is_si'] = aarp['is_si'].apply(lambda x: 0 if x == 1 else 1)

In [4]:
# make decision tree object
clf = DecisionTreeClassifier(random_state=42)

In [5]:
# make our x and y
x = aarp[model_vars].drop('D9C_recode', axis=1)
y = aarp['is_si']

In [6]:
# train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [7]:
# decision tree regular fit
clf.fit(x_train,y_train)
clf.score(x_test, y_test)

0.61198738170347

In [8]:
# baseline
aarp['is_si'].value_counts()[0]/aarp['is_si'].value_counts().sum()
# 68% baseline..

0.6793336803748048

In [9]:
# 79%... what quantities?
aarp['is_si'].value_counts()

0    1305
1     616
Name: is_si, dtype: int64

In [10]:
# prepare grid search
param_grid = { 
    'max_leaf_nodes': range(2,1000,22),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : range(1,100, 5),
    'criterion' :['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None]
}

In [11]:
# do the grid search
CV_DT = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, n_jobs=-1) 
CV_DT.fit(x_train, y_train)

In [12]:
# score it
CV_DT.score(x_test, y_test)

0.6514195583596214

In [13]:
# 65... even grid search can't get it much better?

In [14]:
# what are the feature importances?
for importance, feature in zip(clf.feature_importances_, clf.feature_names_in_):
    print(f'{round(importance,3)}\t{feature}')

0.094	AGE4_recode
0.086	D6_recode
0.101	D8_recode
0.082	D18_recode
0.055	D26_recode
0.067	MARITAL_recode
0.073	EDUC4_recode
0.1	RACETHNICITY_recode
0.045	GENDER_recode
0.118	HHSIZE
0.179	INCOME_recode


In [15]:
# DT for each age category... regular decision tree
for ages in aarp['AGE4_recode'].unique():
    age_aarp = aarp.loc[aarp['AGE4_recode'] == ages]
    x = age_aarp.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
    y = age_aarp['is_si']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    DecisionTreeClassifier(random_state=42)
    clf.fit(x_train,y_train)
    print(f'---------- score for age {ages} {round(clf.score(x_test, y_test),3)}% -----------')
    
    for importance, feature in zip(clf.feature_importances_, clf.feature_names_in_):
        print(f'{round(importance,3)}\t{feature}')

---------- score for age 4 0.68% -----------
0.114	D6_recode
0.096	D8_recode
0.135	D18_recode
0.058	D26_recode
0.173	MARITAL_recode
0.063	EDUC4_recode
0.094	RACETHNICITY_recode
0.05	GENDER_recode
0.059	HHSIZE
0.157	INCOME_recode
---------- score for age 2 0.511% -----------
0.125	D6_recode
0.155	D8_recode
0.099	D18_recode
0.033	D26_recode
0.112	MARITAL_recode
0.072	EDUC4_recode
0.098	RACETHNICITY_recode
0.052	GENDER_recode
0.085	HHSIZE
0.17	INCOME_recode
---------- score for age 3 0.578% -----------
0.107	D6_recode
0.164	D8_recode
0.112	D18_recode
0.055	D26_recode
0.112	MARITAL_recode
0.055	EDUC4_recode
0.048	RACETHNICITY_recode
0.026	GENDER_recode
0.156	HHSIZE
0.165	INCOME_recode
---------- score for age 1 0.484% -----------
0.084	D6_recode
0.105	D8_recode
0.093	D18_recode
0.02	D26_recode
0.064	MARITAL_recode
0.14	EDUC4_recode
0.119	RACETHNICITY_recode
0.038	GENDER_recode
0.137	HHSIZE
0.199	INCOME_recode


In [16]:
# DT for each age category... regular decision tree

# prepare grid search
param_grid = { 
    'max_leaf_nodes': range(2,1000,22),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : range(1,100, 5),
    'criterion' :['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None]
}

for ages in aarp['AGE4_recode'].unique():
    age_aarp = aarp.loc[aarp['AGE4_recode'] == ages]
    x = age_aarp.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
    y = age_aarp['is_si']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    DecisionTreeClassifier(random_state=42)
    # do the grid search
    CV_DT = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, n_jobs=-1) 
    CV_DT.fit(x_train, y_train)
    print(f'---------- score for age {ages} {round(CV_DT.best_estimator_.score(x_test, y_test),3)}% -----------')
    
    for importance, feature in zip(CV_DT.best_estimator_.feature_importances_, CV_DT.feature_names_in_):
        print(f'{round(importance,3)}\t{feature}')

---------- score for age 4 0.81% -----------
0.0	D6_recode
0.0	D8_recode
0.0	D18_recode
0.0	D26_recode
1.0	MARITAL_recode
0.0	EDUC4_recode
0.0	RACETHNICITY_recode
0.0	GENDER_recode
0.0	HHSIZE
0.0	INCOME_recode
---------- score for age 2 0.624% -----------
0.237	D6_recode
0.227	D8_recode
0.124	D18_recode
0.0	D26_recode
0.118	MARITAL_recode
0.045	EDUC4_recode
0.122	RACETHNICITY_recode
0.026	GENDER_recode
0.024	HHSIZE
0.076	INCOME_recode
---------- score for age 3 0.646% -----------
0.089	D6_recode
0.129	D8_recode
0.077	D18_recode
0.143	D26_recode
0.147	MARITAL_recode
0.047	EDUC4_recode
0.0	RACETHNICITY_recode
0.0	GENDER_recode
0.192	HHSIZE
0.175	INCOME_recode
---------- score for age 1 0.535% -----------
0.204	D6_recode
0.075	D8_recode
0.044	D18_recode
0.101	D26_recode
0.081	MARITAL_recode
0.029	EDUC4_recode
0.022	RACETHNICITY_recode
0.052	GENDER_recode
0.175	HHSIZE
0.219	INCOME_recode


### same as above but upsampling training data

In [17]:
# make train and test set
aarp_train, aarp_test = train_test_split(aarp, test_size=0.33, random_state=42)

In [18]:
aarp_train['is_si'].value_counts()

0    883
1    404
Name: is_si, dtype: int64

In [19]:
# upsample train set
to_resample = aarp_train.loc[aarp['is_si'] ==1]
our_resample = to_resample.sample(n = 479, replace = True)
aarp_train_rebal = pd.concat([aarp_train, our_resample])

In [20]:
# manually create x,y test and trains
x_train = aarp_train_rebal.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
x_test = aarp_test.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
y_train = aarp_train_rebal['is_si']
y_test = aarp_test['is_si']

In [21]:
aarp_train_rebal['is_si'].value_counts()

0    883
1    883
Name: is_si, dtype: int64

In [22]:
# grid search it
CV_DT = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, n_jobs=-1) 
CV_DT.fit(x_train, y_train)

In [23]:
CV_DT.score(x_test, y_test)

0.6041009463722398

In [27]:
# DT for each age category
for ages in aarp['AGE4_recode'].unique():
    
    # grab aarp for single age group
    aged_aarp = aarp.loc[aarp['AGE4_recode'] == ages]
    
    # split into train and test
    aarp_train, aarp_test = train_test_split(aged_aarp, test_size=0.33, random_state=42)
    
    # upsample train set
    # make sure that the 0 count is larger
    if aarp_train['is_si'].value_counts()[0] - aarp_train['is_si'].value_counts()[1] > 0:
        amt_to_rebal = aarp_train['is_si'].value_counts()[0] - aarp_train['is_si'].value_counts()[1]
        
        to_resample = aarp_train.loc[aarp['is_si'] ==1]
        our_resample = to_resample.sample(n = amt_to_rebal, replace = True)
        aarp_train_rebal = pd.concat([aarp_train, our_resample])
    else:
        # make sure the number is positive and we recode 1 instead
        amt_to_rebal = (aarp_train['is_si'].value_counts()[0] - aarp_train['is_si'].value_counts()[1]) * -1
        to_resample = aarp_train.loc[aarp_train['is_si'] ==0]
        our_resample = to_resample.sample(n = amt_to_rebal, replace = True)
        aarp_train_rebal = pd.concat([aarp_train, our_resample])
        
    
    # manually create x,y test and trains
    x_train = aarp_train_rebal.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
    x_test = aarp_test.drop(['D9C_recode', 'AGE4_recode', 'is_si'], axis=1)
    y_train = aarp_train_rebal['is_si']
    y_test = aarp_test['is_si']
    
    # new decision tree
    clf = DecisionTreeClassifier(random_state=42)
    
    # grid to search
    param_grid = { 
    'max_leaf_nodes': range(2,1000,11),
    'max_features': ['auto', 'sqrt', 'log2'], 
    'max_depth' : range(1,100, 3),
    'criterion' :['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None]
}
    # new grid search using decision tree and grid from above
    CV_DT = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5, n_jobs=-1) 
    
    # let's 'train'
    CV_DT.fit(x_train, y_train)
    
    # print the results
    print(f'---------- score for age {ages}: {round(CV_DT.score(x_test, y_test),3)}% ----------')
    
    for importance, feature in zip(CV_DT.best_estimator_.feature_importances_, CV_DT.feature_names_in_):
        print(f'{round(importance,3)}\t{feature}')
        
    ###### print train and test set sizes
    print(f'training set size: \n{y_train.value_counts()}')
    print(f'testing set size:  \n{y_test.value_counts()}')
    ###### 

---------- score for age 4: 0.699% ----------
0.12	D6_recode
0.076	D8_recode
0.164	D18_recode
0.054	D26_recode
0.091	MARITAL_recode
0.056	EDUC4_recode
0.07	RACETHNICITY_recode
0.102	GENDER_recode
0.039	HHSIZE
0.227	INCOME_recode
training set size: 
1    253
0    253
Name: is_si, dtype: int64
testing set size:  
0    124
1     29
Name: is_si, dtype: int64
---------- score for age 2: 0.539% ----------
0.2	D6_recode
0.153	D8_recode
0.092	D18_recode
0.037	D26_recode
0.092	MARITAL_recode
0.06	EDUC4_recode
0.077	RACETHNICITY_recode
0.06	GENDER_recode
0.099	HHSIZE
0.13	INCOME_recode
training set size: 
0    241
1    241
Name: is_si, dtype: int64
testing set size:  
0    117
1     61
Name: is_si, dtype: int64
---------- score for age 3: 0.578% ----------
0.104	D6_recode
0.127	D8_recode
0.1	D18_recode
0.031	D26_recode
0.099	MARITAL_recode
0.083	EDUC4_recode
0.094	RACETHNICITY_recode
0.044	GENDER_recode
0.168	HHSIZE
0.15	INCOME_recode
training set size: 
1    221
0    221
Name: is_si, dtype: int