In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

In [2]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [3]:
from keras.models import Sequential
from keras.layers import Dense

Set notebook properties

In [4]:
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
sns.set_style("darkgrid")

Set data path

In [5]:
DATA_PATH = r'../data_source'

In [6]:
training_set_features = pd.read_csv(os.path.join(DATA_PATH, 'training_set_features.csv'))
training_set_labels = pd.read_csv(os.path.join(DATA_PATH, 'training_set_labels.csv'))
test_set_features = pd.read_csv(os.path.join(DATA_PATH, 'test_set_features.csv'))

#### Process Features

In [8]:
def process_features(df):
    
    cols_to_process =  ['h1n1_concern', 'h1n1_knowledge',
                        'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
                        'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
                        'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
                        'education', 'race', 'sex', 'income_poverty', 'marital_status',
                        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                        'household_adults', 'household_children', 'employment_industry',
                        'employment_occupation']
    
    for i in cols_to_process:
        df[i] = [f'{i}_' + str(x)  for x in df[i]]
        
    concat_list = []
    for i in cols_to_process:
        concat_list.append(pd.get_dummies(df[i]))
        
    one_hot_encoded = pd.concat(concat_list, axis=1)
    df = df.drop(columns=cols_to_process)
    df_concatenated = pd.concat([df, one_hot_encoded], axis=1)
        
    return df_concatenated

In [10]:
X = process_features(training_set_features).iloc[:,1:].fillna(0)
y_h1n1 = training_set_labels['h1n1_vaccine']
y_seasonal = training_set_labels['seasonal_vaccine']
X_test = process_features(test_set_features).iloc[:,1:].fillna(0)

In [11]:
X.columns = ['col_' + str(x) for x in range(len(X.columns))]
X_test.columns = ['col_' + str(x) for x in range(len(X_test.columns))]

In [15]:
def hyperParameterTuning(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5]
    }
    
    xgb_model = XGBClassifier()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,                        
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

    gsearch.fit(X_train,y_train)

    return gsearch.best_params_

In [None]:
hyperParameterTuning(X, y_seasonal)

In [26]:
xgb_h1n1 = XGBClassifier(learning_rate=0.001, n_estimators=200, max_depth=4,
                         min_child_weight=5, gamma=1, subsample=0.8, colsample_bytree=0.8,
                         objective= 'binary:logistic', nthread=4, verbose=1, eval_metric='auc')
xgb_h1n1.fit(X, y_h1n1)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='auc',
              gamma=1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=4, min_child_weight=5, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbose=1, verbosity=None)

In [27]:
xgb_seasonal = XGBClassifier(learning_rate=0.001, n_estimators=200, max_depth=4,
                             min_child_weight=5, gamma=1, subsample=0.8, colsample_bytree=0.8,
                             objective= 'binary:logistic', nthread=4, verbose=1, eval_metric='auc')
xgb_seasonal.fit(X, y_seasonal)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='auc',
              gamma=1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=4, min_child_weight=5, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbose=1, verbosity=None)

In [28]:
probability_h1n1 = xgb_h1n1.predict_proba(X_test)
probability_h1n1 = [x[1] for x in probability_h1n1]

In [29]:
probability_seasonal = xgb_seasonal.predict_proba(X_test)
probability_seasonal = [x[1] for x in probability_seasonal]

In [30]:
submission = pd.DataFrame()
submission['respondent_id'] = test_set_features['respondent_id']
submission['h1n1_vaccine'] = probability_h1n1
submission['seasonal_vaccine'] = probability_seasonal

In [31]:
outpath = os.path.join(r'../output', 'xgboost2.csv')

In [32]:
submission.to_csv(outpath, index=False)