In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
X_train = pd.read_csv('data/X_train_h1n1.csv', index_col= 'respondent_id')
X_test = pd.read_csv('data/X_test_h1n1.csv', index_col= 'respondent_id')
y_train = pd.read_csv('data/y_train_h1n1.csv', index_col= 'respondent_id')
y_test = pd.read_csv('data/y_test_h1h1.csv', index_col= 'respondent_id')

In [3]:
y_train = y_train['h1n1_vaccine']
y_test = y_test['h1n1_vaccine']

In [4]:
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
])

transformer = ColumnTransformer([('categorical', cat_pipe, X_train_cat.columns),
                                 ('numerical', num_pipe, X_train_num.columns)])

In [5]:
model_pipe = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

model_pipe.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    0.8s finished


Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype=...
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'op

In [6]:
rfc_cv_score = cross_val_score(model_pipe, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.8s remaining:   10.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.9s finished


In [7]:
rfc_cv_score

array([0.83449825, 0.8349975 , 0.83275087, 0.83799301, 0.83325012])

# Random Forest Classifier

In [8]:
model_pipe_2 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [9]:
params = {
    'rfc__max_depth': list(range(10,100,10)),
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': list(range(100,250,50)),
    'rfc__min_samples_leaf': list(range(2,10,2)),
    'rfc__min_samples_split': list(range(2,10,2)),
}

In [10]:
gs_rfc = GridSearchCV(model_pipe_2, params, n_jobs=-1, verbose=3, cv = 3)
gs_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.5s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'opinion_seas_risk', 'opinion_seas_sick_from

In [11]:
gs_rfc.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': 40,
 'rfc__min_samples_leaf': 6,
 'rfc__min_samples_split': 8,
 'rfc__n_estimators': 100}

In [12]:
rfc_cv_1 = cross_val_score(gs_rfc, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 223.7min remaining: 335.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 223.9min finished


In [13]:
rfc_cv_1

array([0.83349975, 0.8349975 , 0.83474788, 0.83674488, 0.83474788])

## Gradiant Boost Classifier

In [14]:
model_pipe_3 = imbPipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('gbc', GradientBoostingClassifier(verbose=3))
])

In [15]:
model_pipe_3.get_params()

{'memory': None,
 'steps': [('trans',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
          'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
          'census_msa'],
         dtype='object')),
                                   ('numerical',
                                    P...
          'behavioral_large_gatherings', 'behavioral_outside_home',
          'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
          'chronic_med_condition', 'child_u

In [16]:
params = {
    'gbc__learning_rate': [0.001, 0.01, 0.1, .5, .9],
    'gbc__n_estimators': list(range(100,250,50)),
    'gbc__min_samples_leaf': list(range(2,10,2)),
    'gbc__min_samples_split': list(range(2,10,2)),
    'gbc__max_features': list(range(0,200,50))
}

In [17]:
gs_gbc = GridSearchCV(model_pipe_3, params, n_jobs=-1, verbose=1, cv = 3)
gs_gbc.fit(X_train, y_train)

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.79266145 0.79146278 0.79520703 0.79016473 0.77493819 0.78033052
 0.78881668 0.78362472 0.79425897 0.7755375  0.77733468 0.79081379
 0.79286055 0.77518782 0.78846722 0.77403958 0.79390878 0.77453881
 0.79061389 0.78921606 0.79540734 0.78936583 0.78736892 0.78946591
 0.77793359 0.78951559 0.77418935 0.77513788 0.78627103 0.79485791
 0.77548734 0.78976521 0.79735418 0.78801806 0.78756861 0.79001482
 0.78222689 0.79355933 0.79425839 0.79001482 0.77833328 0.78906629
 0.79216149 0.78841741 0.79395944 0.787319   0.77648593 0.7901

      Iter       Train Loss   Remaining Time 
         1           1.3150           26.04s
         2           1.2528           24.91s
         3           1.2028           23.67s
         4           1.1592           22.98s
         5           1.1200           22.73s
         6           1.0883           22.24s
         7           1.0610           21.39s
         8           1.0380           21.13s
         9           1.0168           20.74s
        10           0.9966           20.38s
        11           0.9786           20.15s
        12           0.9616           19.75s
        13           0.9452           19.52s
        14           0.9302           18.95s
        15           0.9160           18.64s
        16           0.9047           18.35s
        17           0.8868           18.26s
        18           0.8759           18.32s
        19           0.8645           18.40s
        20           0.8537           18.18s
        21           0.8413           17.87s
        2

       185           0.4815            1.34s
       186           0.4812            1.25s
       187           0.4808            1.16s
       188           0.4802            1.07s
       189           0.4799            0.98s
       190           0.4795            0.89s
       191           0.4791            0.80s
       192           0.4789            0.71s
       193           0.4785            0.62s
       194           0.4781            0.54s
       195           0.4779            0.45s
       196           0.4777            0.36s
       197           0.4772            0.27s
       198           0.4768            0.18s
       199           0.4766            0.09s
       200           0.4764            0.00s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'opinion_seas_risk', 'opinion_seas_sick_from

In [18]:
gs_gbc.best_params_

{'gbc__learning_rate': 0.1,
 'gbc__max_features': 50,
 'gbc__min_samples_leaf': 6,
 'gbc__min_samples_split': 8,
 'gbc__n_estimators': 200}

In [19]:
gbc_cv_1 = cross_val_score(gs_gbc, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 159.1min remaining: 238.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 159.2min finished


In [20]:
gbc_cv_1

array([0.83849226, 0.83724413, 0.8349975 , 0.83924114, 0.83924114])