In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore')

In [2]:
X_train = pd.read_csv('../data/X_train_h1n1.csv', index_col= 'respondent_id')
X_test = pd.read_csv('../data/X_test_h1n1.csv', index_col= 'respondent_id')
y_train = pd.read_csv('../data/y_train_h1n1.csv', index_col= 'respondent_id')
y_test = pd.read_csv('../data/y_test_h1h1.csv', index_col= 'respondent_id')

In [3]:
y_train = y_train['h1n1_vaccine']
y_test = y_test['h1n1_vaccine']

In [4]:
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
])

transformer = ColumnTransformer([('categorical', cat_pipe, X_train_cat.columns),
                                 ('numerical', num_pipe, X_train_num.columns)])

# Random Forest Classifier

In [5]:
model_pipe = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

model_pipe.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    2.1s finished


Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype=...
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'op

In [6]:
rfc_cv_score = cross_val_score(model_pipe, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.8s remaining:   23.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.3s finished


In [7]:
rfc_cv_score.mean()

0.8320519221168248

# RFC with Grid Search

In [8]:
model_pipe_2 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [9]:
params = {
    'rfc__max_depth': list(range(10,100,10)),
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': list(range(100,250,25)),
    'rfc__min_samples_leaf': list(range(2,10,2)),
    'rfc__min_samples_split': list(range(2,10,2)),
}

In [10]:
gs_rfc = GridSearchCV(model_pipe_2, params, n_jobs=-1, verbose=3, cv = 3)
gs_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done 186 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-2)]: Done 225 out of 225 | elapsed:    3.2s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'household_children'],
      dtype='object')

In [11]:
gs_rfc.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': 50,
 'rfc__min_samples_leaf': 4,
 'rfc__min_samples_split': 4,
 'rfc__n_estimators': 225}

In [12]:
rfc_cv_1 = cross_val_score(gs_rfc.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   13.2s remaining:   19.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.4s finished


In [13]:
rfc_cv_1.mean()

0.835846230654019

After running a Random Forest Classifier I was able to get an accuracy score of $0.836$. This score is currently the highest achieved, $0.014$ higher than the SVC, $0.073$ higher than the KNN Classifier, and $0.059$ higher than the Logistic Regression. Next, I will run a Gradient Boost Classifier and see if it is higher than Random Forest Classifier. 

## Gradiant Boost Classifier

In [14]:
model_pipe_3 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('gbc', GradientBoostingClassifier(verbose=3))
])

In [15]:
model_pipe_3.get_params()

{'memory': None,
 'steps': [('trans',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
          'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
          'census_msa'],
         dtype='object')),
                                   ('numerical',
                                    P...
          'behavioral_large_gatherings', 'behavioral_outside_home',
          'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
          'chronic_med_condition', 'child_u

In [16]:
params = {
    'gbc__learning_rate': [0.001, 0.01, 0.1, .5, .9],
    'gbc__n_estimators': list(range(100,250,50)),
    'gbc__min_samples_leaf': list(range(2,10,2)),
    'gbc__min_samples_split': list(range(2,10,2)),
    'gbc__max_features': list(range(0,200,50))
}

In [17]:
gs_gbc = GridSearchCV(model_pipe_3, params, n_jobs=-1, verbose=1, cv = 3)
gs_gbc.fit(X_train, y_train)

Fitting 3 folds for each of 960 candidates, totalling 2880 fits
      Iter       Train Loss   Remaining Time 
         1           1.3165           20.09s
         2           1.2542           18.99s
         3           1.2028           18.78s
         4           1.1593           18.95s
         5           1.1217           18.39s
         6           1.0898           18.40s
         7           1.0619           17.66s
         8           1.0358           17.62s
         9           1.0123           17.45s
        10           0.9943           17.16s
        11           0.9763           16.97s
        12           0.9600           16.87s
        13           0.9452           16.64s
        14           0.9316           16.40s
        15           0.9165           16.38s
        16           0.9036           16.14s
        17           0.8877           16.02s
        18           0.8770           16.00s
        19           0.8608           15.79s
        20           0.8503        

       183           0.4818            1.49s
       184           0.4815            1.40s
       185           0.4810            1.31s
       186           0.4807            1.22s
       187           0.4803            1.14s
       188           0.4800            1.05s
       189           0.4796            0.96s
       190           0.4791            0.87s
       191           0.4789            0.79s
       192           0.4786            0.70s
       193           0.4782            0.61s
       194           0.4781            0.52s
       195           0.4778            0.44s
       196           0.4776            0.35s
       197           0.4771            0.26s
       198           0.4769            0.17s
       199           0.4767            0.09s
       200           0.4764            0.00s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'opinion_seas_risk', 'opinion_seas_sick_from

In [18]:
gs_gbc.best_params_

{'gbc__learning_rate': 0.1,
 'gbc__max_features': 50,
 'gbc__min_samples_leaf': 2,
 'gbc__min_samples_split': 6,
 'gbc__n_estimators': 200}

In [19]:
gbc_cv_1 = cross_val_score(gs_gbc.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   30.3s remaining:   45.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.6s finished


In [20]:
gbc_cv_1.mean()

0.8381427858212682

The Gradient Boosting Classifier performed the best of all the models that have been created so far with a score of $0.838$. In the new two notebooks, I will run an XGBoost Classifier and a Neural Network and see if they perform any better than this model.