In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
X_train = pd.read_csv('../data/X_train_h1n1.csv', index_col= 'respondent_id')
X_test = pd.read_csv('../data/X_test_h1n1.csv', index_col= 'respondent_id')
y_train = pd.read_csv('../data/y_train_h1n1.csv', index_col= 'respondent_id')
y_test = pd.read_csv('../data/y_test_h1h1.csv', index_col= 'respondent_id')

In [3]:
y_train = y_train['h1n1_vaccine']
y_test = y_test['h1n1_vaccine']

In [4]:
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
])

transformer = ColumnTransformer([('categorical', cat_pipe, X_train_cat.columns),
                                 ('numerical', num_pipe, X_train_num.columns)])

In [5]:
model_pipe = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

model_pipe.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    0.6s finished


Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype=...
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'op

In [6]:
rfc_cv_score = cross_val_score(model_pipe, X_train, y_train, n_jobs=-1, verbose=3, scoring = 'f1')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.2s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s finished


In [7]:
rfc_cv_score.mean()

0.5215878443782369

# Random Forest Classifier

In [8]:
model_pipe_2 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [9]:
params = {
    'rfc__max_depth': list(range(10,100,10)),
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': list(range(100,250,25)),
    'rfc__min_samples_leaf': list(range(2,10,2)),
    'rfc__min_samples_split': list(range(2,10,2)),
}

In [10]:
gs_rfc = GridSearchCV(model_pipe_2, params, n_jobs=-1, verbose=3, cv = 3, scoring = 'f1')
gs_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 2024 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 51.3min
[Parallel(n_jobs=-1)]: Done 3848 tasks      | elapsed: 63.4min
[Parallel(n_jobs=-1)]: Done 4584 tasks      | elapsed: 76.3min
[Parallel(n_jobs=-1)]: Done 5184 out of 5184 | elapsed: 87.0min finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
      dtype='object'))])),
                        

In [11]:
gs_rfc.best_params_

{'rfc__criterion': 'gini',
 'rfc__max_depth': 10,
 'rfc__min_samples_leaf': 8,
 'rfc__min_samples_split': 6,
 'rfc__n_estimators': 200}

In [12]:
rfc_cv_1 = cross_val_score(gs_rfc, X_train, y_train, n_jobs=-1, verbose=3, scoring = 'f1')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 423.1min remaining: 634.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 424.6min finished


In [13]:
rfc_cv_1.mean()

0.5714167753683743

## Gradiant Boost Classifier

In [14]:
model_pipe_3 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('gbc', GradientBoostingClassifier(verbose=3))
])

In [15]:
model_pipe_3.get_params()

{'memory': None,
 'steps': [('trans',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
          'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
          'census_msa'],
         dtype='object')),
                                   ('numerical',
                                    P...
          'behavioral_large_gatherings', 'behavioral_outside_home',
          'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
          'chronic_med_condition', 'child_u

In [16]:
params = {
    'gbc__learning_rate': [0.001, 0.01, 0.1, .5, .9],
    'gbc__n_estimators': list(range(100,250,50)),
    'gbc__min_samples_leaf': list(range(2,10,2)),
    'gbc__min_samples_split': list(range(2,10,2)),
    'gbc__max_features': list(range(0,200,50))
}

In [17]:
gs_gbc = GridSearchCV(model_pipe_3, params, n_jobs=-1, verbose=1, cv = 3, scoring = 'f1')
gs_gbc.fit(X_train, y_train)

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 40.2min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 49.5min finished


      Iter       Train Loss   Remaining Time 
         1           1.3786           15.38s
         2           1.3716           15.65s
         3           1.3645           16.07s
         4           1.3578           15.69s
         5           1.3507           15.33s
         6           1.3437           15.39s
         7           1.3368           15.04s
         8           1.3300           15.03s
         9           1.3235           15.01s
        10           1.3170           14.78s
        11           1.3106           14.76s
        12           1.3044           14.71s
        13           1.2983           14.66s
        14           1.2924           14.52s
        15           1.2865           14.37s
        16           1.2807           14.33s
        17           1.2749           14.28s
        18           1.2693           14.23s
        19           1.2638           14.25s
        20           1.2585           14.12s
        21           1.2532           14.07s
        2

       185           0.8783            1.16s
       186           0.8770            1.08s
       187           0.8758            1.01s
       188           0.8744            0.93s
       189           0.8734            0.85s
       190           0.8724            0.77s
       191           0.8708            0.70s
       192           0.8699            0.62s
       193           0.8689            0.54s
       194           0.8678            0.46s
       195           0.8667            0.39s
       196           0.8652            0.31s
       197           0.8641            0.23s
       198           0.8631            0.15s
       199           0.8623            0.08s
       200           0.8612            0.00s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'household_children'],
      dtype='object')

In [18]:
gs_gbc.best_params_

{'gbc__learning_rate': 0.01,
 'gbc__max_features': 50,
 'gbc__min_samples_leaf': 2,
 'gbc__min_samples_split': 4,
 'gbc__n_estimators': 200}

In [19]:
gbc_cv_1 = cross_val_score(gs_gbc, X_train, y_train, n_jobs=-1, verbose=3, scoring = 'f1')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 178.3min remaining: 267.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 180.0min finished


In [20]:
gbc_cv_1.mean()

0.5860809429492359