In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as imbPipeline

from xgboost import XGBClassifier
import warnings
# Ignore every FutureWarning raised from here on, for the whole session.
# NOTE(review): this also hides library deprecation notices (e.g. renamed
# keyword arguments), so upcoming breakages will not be visible in cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the four CSV tables (train/test features and labels) in one pass, each
# indexed by respondent_id. The paths are listed in the same order as the
# names they are unpacked into.
# NOTE(review): the last file name is spelled 'h1h1' while the other three use
# 'h1n1' — confirm it matches the file on disk before changing it.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path, index_col='respondent_id')
    for csv_path in (
        '../data/X_train_h1n1.csv',
        '../data/X_test_h1n1.csv',
        '../data/y_train_h1n1.csv',
        '../data/y_test_h1h1.csv',
    )
)

In [3]:
# Narrow each label table to the single target column used for modelling.
# The isinstance guards make this cell idempotent: after the first run
# y_train / y_test are already Series, and indexing a Series by
# 'h1n1_vaccine' again would raise a KeyError on re-execution.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['h1n1_vaccine']
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['h1n1_vaccine']

In [4]:
# Split the training features by dtype so each group gets its own preprocessing.
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

# OneHotEncoder's dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4.
# Prefer the new keyword and fall back to the old one so the notebook runs on
# either side of the rename (an unknown keyword raises TypeError at construction).
try:
    dense_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', dense_ohe)
])
# Numerical columns: most-frequent-value imputation only (no scaling step).
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
])

# Route each column group through its pipeline. Columns in neither group are
# not passed through, since no `remainder` is specified here.
transformer = ColumnTransformer([('categorical', cat_pipe, X_train_cat.columns),
                                 ('numerical', num_pipe, X_train_num.columns)])

# XGBoost

In [5]:
# Baseline model: column preprocessing -> SMOTE oversampling -> XGBoost with
# its default hyper-parameters. The imbalanced-learn Pipeline is used because
# the chain contains a sampler step.
model_pipe = imbPipeline([
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('xgboost', XGBClassifier(random_state=42, n_jobs=-1)),
])

In [6]:
# Cross-validated F1 scores of the baseline pipeline on the training data,
# with folds evaluated in parallel on all available cores.
cv_XGBoost1 = cross_val_score(
    estimator=model_pipe,
    X=X_train,
    y=y_train,
    scoring='f1',
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.0s remaining:   21.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.4s finished


In [7]:
# Average F1 across the cross-validation folds of the baseline pipeline.
np.mean(cv_XGBoost1)

0.5291627008368801

# Grid Search on XGBoost

In [8]:
# Fresh pipeline with the same three steps as the baseline
# (preprocessing -> SMOTE -> XGBoost), used as the estimator for the grid search.
model_pipe_2 = imbPipeline([
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('xgboost', XGBClassifier(random_state=42, n_jobs=-1)),
])

In [9]:
# Show every tunable parameter of the pipeline, nested steps included; the
# `<step>__<param>` names listed here are the keys a parameter grid must use.
model_pipe_2.get_params(deep=True)

{'memory': None,
 'steps': [('trans',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
          'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
          'census_msa'],
         dtype='object')),
                                   ('numerical',
                                    P...
          'behavioral_large_gatherings', 'behavioral_outside_home',
          'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
          'chronic_med_condition', 'child_u

In [10]:
# Search space for the XGBoost step of the pipeline. Each key is prefixed with
# the step name ('xgboost') followed by a double underscore so the value is
# routed to that step. 3 * 5 * 3 * 3 * 3 = 405 combinations in total.
params = dict(
    xgboost__min_child_weight=[1, 5, 10],
    xgboost__gamma=[0.5, 1, 1.5, 2, 5],
    xgboost__subsample=[0.6, 0.8, 1.0],
    xgboost__colsample_bytree=[0.6, 0.8, 1.0],
    xgboost__max_depth=[3, 4, 5],
)

In [11]:
# Exhaustive grid search over `params`, ranking candidates by F1 and keeping
# the training-fold scores as well so over-fitting can be inspected afterwards.
gs_XGBoost = GridSearchCV(
    estimator=model_pipe_2,
    param_grid=params,
    scoring='f1',
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

In [12]:
# Run the grid search on the training data: every parameter combination is
# cross-validated, so this is the first expensive cell of the notebook.
gs_XGBoost.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits






GridSearchCV(estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_regio...
                                                      reg_lambda=

In [13]:
# Parameter combination that the fitted search ranked best under its 'f1' scoring.
gs_XGBoost.best_params_

{'xgboost__colsample_bytree': 1.0,
 'xgboost__gamma': 5,
 'xgboost__max_depth': 5,
 'xgboost__min_child_weight': 10,
 'xgboost__subsample': 1.0}

In [14]:
# Nested cross-validation: the estimator handed to cross_val_score is the
# GridSearchCV object itself, so the complete grid search is re-run inside
# every outer fold and the returned F1 scores are for the tuned model.
# NOTE(review): this is by far the slowest cell of the notebook; if only the
# inner-CV score of the chosen parameters is needed, the already-fitted search
# exposes it without re-fitting — confirm which estimate is intended.
cv_XGBoost = cross_val_score(
    estimator=gs_XGBoost,
    X=X_train,
    y=y_train,
    scoring='f1',
    n_jobs=-1,
    verbose=3,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 209.4min remaining: 314.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 209.4min finished


In [15]:
# Average F1 across the outer folds of the nested cross-validation.
np.mean(cv_XGBoost)

0.5449053159381265