In [240]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss, confusion_matrix, plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder

# from ipynb.fs.full.Functions import factorial

RANDOM_SEED = 1235846    # Set a random seed for reproducibility!

In [273]:
def model_predictions(model, x_train, y_train, x_test=None, y_test=None):

    '''Enter model name and test/train sets to generate predictions, precision, recall, accuracy, and F1 score'''
    model.fit(x_train, y_train)
    y_hat_train = model.predict(x_train)
    if x_test is not None:
        y_hat_test = model.predict(x_test)
    print('Training Precision: ', precision_score(y_train, y_hat_train))
    if x_test is not None:
        print('Testing Precision: ', precision_score(y_test, y_hat_test))
    print('-----')

    print('Training Recall: ', recall_score(y_train, y_hat_train))
    if x_test is not None:
        print('Testing Recall: ', recall_score(y_test, y_hat_test))
    print('-----')

    print('Training Accuracy: ', accuracy_score(y_train, y_hat_train))
    if x_test is not None:
        print('Testing Accuracy: ', accuracy_score(y_test, y_hat_test))
    print('-----')

    print('Training F1-Score: ', f1_score(y_train, y_hat_train))
    if x_test is not None:
        print('Testing F1-Score: ', f1_score(y_test, y_hat_test))
    return precision_score(y_test, y_hat_test), accuracy_score(y_test, y_hat_test)

In [246]:
features_df = pd.read_csv('./data/flu_training_set_features.csv', index_col='respondent_id')
labels_df = labels_df = pd.read_csv("./data/flu_training_set_labels.csv", index_col="respondent_id")

In [247]:
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    labels_df,
    test_size=0.25,
    random_state=RANDOM_SEED
)

In [248]:
drop_features = ['hhs_geo_region', 'employment_industry','employment_occupation']
X_train.drop(drop_features, axis=1, inplace=True)
X_test.drop(drop_features, axis=1, inplace=True)
numeric_features = [col for col in X_train.columns if X_train[col].dtype != 'O']
categorical_features = [col for col in X_train.columns if X_train[col].dtype == 'O']

In [249]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median", add_indicator=True))]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [241]:
models = {'Baseline':
              {'regressor': DummyRegressor(),
               'preprocessor': preprocessor},
          'Logistic':
              {'regressor': LogisticRegression(),
               'preprocessor': preprocessor},
          'Naive_Bayes':
              {'regressor': GaussianNB(),
               'preprocessor': preprocessor},
          'Decision_Trees':
              {'regressor': DecisionTreeClassifier(),
               'preprocessor': preprocessor},
          'KNN':
              {'regressor': KNeighborsClassifier(),
               'preprocessor': preprocessor},
          'Random_Forest':
              {'regressor': RandomForestClassifier(),
               'preprocessor': preprocessor},
          'G_Boost':
              {'regressor': GradientBoostingClassifier(),
               'preprocessor': preprocessor},
          'Hist_Boost':
              {'regressor': HistGradientBoostingClassifier(),
               'preprocessor': preprocessor}
              }

In [274]:
count = 0
precision_dict = {}
accuracy_dict = {}
for name, model in models.items():  
    print(name)
    X_train_processed = model['preprocessor'].fit_transform(X_train)
    X_test_processed = model['preprocessor'].transform(X_test)
    model['fit_regressor'] = model['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
    # model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
    print(model)
    if count > 0:
        precision_dict[name], accuracy_dict[name] = model_predictions(model['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)
    count += 1

Baseline
{'regressor': DummyRegressor(), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_to...
                                  'opinion_h1n1_vacc_effective',
                                  'opinion_

In [275]:
precision_dict

{'Logistic': 0.7753647777400746,
 'Naive_Bayes': 0.6257470119521913,
 'Decision_Trees': 0.6565428109854604,
 'KNN': 0.7050599201065246,
 'Random_Forest': 0.7763741891430522,
 'G_Boost': 0.7829744279946165,
 'Hist_Boost': 0.7785547785547785}

In [276]:
accuracy_dict

{'Logistic': 0.7769956567320653,
 'Naive_Bayes': 0.6851879586640707,
 'Decision_Trees': 0.679047476411562,
 'KNN': 0.7184364235435076,
 'Random_Forest': 0.7763965852927962,
 'G_Boost': 0.785831960461285,
 'Hist_Boost': 0.7844840497229295}

## Tuning Naive Bayes

In [293]:
# Grid_search for Naive Bayes
pipeline_nb = Pipeline([( "preprocessor" , preprocessor),
                       ("nb",GaussianNB())])

parameters_nb = {'nb__var_smoothing': np.logspace(0,-9, num=10)}

gs_nb=GridSearchCV(pipeline_nb,parameters_nb)

gs_nb.fit(X_train,y_train.seasonal_vaccine)

gs_nb.best_params_

{'nb__var_smoothing': 1.0}

In [None]:
# nb_tuned = GaussianNB(var_smoothing=1.0)
# precision_dict[name], accuracy_dict[name] = model_predictions(nb_tuned, X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)

In [325]:
# def best_param_model(model_name, regressor, parameters, X_train, y_train, X_test, y_test):
#     model_tuned = {model_name:
#               {'regressor': regressor(),
#                'preprocessor': preprocessor}}
                     
#     print(list(model_tuned.keys())[0])
#     X_train_processed = model_tuned[model_name]['preprocessor'].fit_transform(X_train)
#     X_test_processed = model_tuned[model_name]['preprocessor'].transform(X_test)
#     model_tuned[model_name]['fit_regressor'] = model_tuned[model_name]['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
#     # model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
#     print(model_tuned)
#     precision_dict[name], accuracy_dict[name] = model_predictions(model_tuned[model_name]['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)
    

In [329]:
precision_dict_tuned = {}
accuracy_dict_tuned = {}

In [326]:
model_nb_tuned = {'Naive_Bayes':
              {'regressor': GaussianNB(var_smoothing=1.0),
               'preprocessor': preprocessor}}
 
print(list(model_nb_tuned.keys())[0])
X_train_processed = model_nb_tuned['Naive_Bayes']['preprocessor'].fit_transform(X_train)
X_test_processed = model_nb_tuned['Naive_Bayes']['preprocessor'].transform(X_test)
model_nb_tuned['Naive_Bayes']['fit_regressor'] = model_nb_tuned['Naive_Bayes']['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
# model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
print(model_nb_tuned)
precision_dict_tuned[name], accuracy_dic_tunedt[name] = model_predictions(model_nb_tuned['Naive_Bayes']['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)


Naive_Bayes
{'Naive_Bayes': {'regressor': GaussianNB(var_smoothing=1.0), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_touch_face', 'doctor_recc_h1n1'...
                                  'opinion_h1n1_vacc_effective',
                                  'opinion_h1n1_risk',
      

## Tuning Random Forest

In [327]:
# Grid_search for Random Forest
pipeline_rfc = Pipeline([( "preprocessor" , preprocessor),
                       ("rfc",RandomForestClassifier())])

parameters_rfc = {}
parameters_rfc['rfc__criterion'] = ['gini', 'entropy']
parameters_rfc['rfc__min_samples_leaf'] = [1, 5, 10]
parameters_rfc['rfc__max_depth'] = [1, 3, 5, 10, 15, 25]

gs_rfc=GridSearchCV(pipeline_rfc,parameters_rfc)

gs_rfc.fit(X_train,y_train.seasonal_vaccine)

gs_rfc.best_params_

{'rfc__criterion': 'gini', 'rfc__max_depth': 15, 'rfc__min_samples_leaf': 5}

In [331]:
model_rfc_tuned = {'Random_Forest':
              {'regressor': RandomForestClassifier(criterion='gini', max_depth=15, min_samples_leaf=5),
               'preprocessor': preprocessor}}
 
print(list(model_rfc_tuned.keys())[0])
X_train_processed = model_rfc_tuned['Random_Forest']['preprocessor'].fit_transform(X_train)
X_test_processed = model_rfc_tuned['Random_Forest']['preprocessor'].transform(X_test)
model_rfc_tuned['Random_Forest']['fit_regressor'] = model_rfc_tuned['Random_Forest']['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
# model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
print(model_rfc_tuned)
precision_dict_tuned[name], accuracy_dict_tuned[name] = model_predictions(model_rfc_tuned['Random_Forest']['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)


Random_Forest
{'Random_Forest': {'regressor': RandomForestClassifier(max_depth=15, min_samples_leaf=5), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_touch_face', 'doctor_recc_h1n1'...
                                  'opinion_h1n1_vacc_effective',
                              

## Tuning Random Forest #2

In [337]:
# Grid_search for Random Forest #2
pipeline_rfc = Pipeline([( "preprocessor" , preprocessor),
                       ("rfc",RandomForestClassifier(criterion='gini', min_samples_leaf=5))])

parameters_rfc = {}
parameters_rfc['rfc__max_depth'] = [3, 5]
parameters_rfc['rfc__ccp_alpha'] = [0, 0.1, 0.5, 1, 2]
parameters_rfc['rfc__n_estimators'] = [50, 100, 250, 500]


gs_rfc=GridSearchCV(pipeline_rfc,parameters_rfc)

gs_rfc.fit(X_train,y_train.seasonal_vaccine)

gs_rfc.best_params_

{'rfc__ccp_alpha': 0, 'rfc__max_depth': 5, 'rfc__n_estimators': 500}

In [343]:
model_rfc_tuned = {'Random_Forest':
              {'regressor': RandomForestClassifier(criterion='gini', max_depth=5, min_samples_leaf=5, n_estimators=500),
               'preprocessor': preprocessor}}
 
print(list(model_rfc_tuned.keys())[0])
X_train_processed = model_rfc_tuned['Random_Forest']['preprocessor'].fit_transform(X_train)
X_test_processed = model_rfc_tuned['Random_Forest']['preprocessor'].transform(X_test)
model_rfc_tuned['Random_Forest']['fit_regressor'] = model_rfc_tuned['Random_Forest']['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
# model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
print(model_rfc_tuned)
precision_dict_tuned['Random_Forest'], accuracy_dict_tuned['Random_Forest'] = model_predictions(model_rfc_tuned['Random_Forest']['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)


Random_Forest
{'Random_Forest': {'regressor': RandomForestClassifier(max_depth=7, min_samples_leaf=5, n_estimators=300), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_touch_face', 'doctor_recc_h1n1'...
                                  'opinion_h1n1_vacc_effective',
             

## Tuning Gradient Boosting

In [348]:
# Grid_search for Gradient Boosting
pipeline_gbc = Pipeline([( "preprocessor" , preprocessor),
                       ("gbc",GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))])

parameters_gbc = {}
parameters_gbc['gbc__min_samples_leaf'] = [1, 5, 10]
parameters_gbc['gbc__max_depth'] = [3, 5, 10, 15, 20]

gs_gbc=GridSearchCV(pipeline_gbc,parameters_gbc)

gs_gbc.fit(X_train,y_train.seasonal_vaccine)

gs_gbc.best_params_

{'gbc__max_depth': 5, 'gbc__min_samples_leaf': 5}

In [349]:
model_gbc_tuned = {'Gradient_Boosting':
              {'regressor': GradientBoostingClassifier(min_samples_leaf=5, max_depth=5),
               'preprocessor': preprocessor}}
 
print(list(model_gbc_tuned.keys())[0])
X_train_processed = model_gbc_tuned['Gradient_Boosting']['preprocessor'].fit_transform(X_train)
X_test_processed = model_gbc_tuned['Gradient_Boosting']['preprocessor'].transform(X_test)
model_gbc_tuned['Gradient_Boosting']['fit_regressor'] = model_gbc_tuned['Gradient_Boosting']['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
# model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
print(model_gbc_tuned)
precision_dict_tuned['Gradient_Boosting'], accuracy_dict_tuned['Gradient_Boosting'] = model_predictions(model_gbc_tuned['Gradient_Boosting']['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)


Gradient_Boosting
{'Gradient_Boosting': {'regressor': GradientBoostingClassifier(max_depth=5, min_samples_leaf=5), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_touch_face', 'doctor_recc_h1n1'...
                                  'opinion_h1n1_vacc_effective',
                   

## Tuning Hist Boost

In [350]:
# Grid_search for Hist Boost
pipeline_hbc = Pipeline([( "preprocessor" , preprocessor),
                       ("hbc",HistGradientBoostingClassifier(learning_rate=0.1, max_iter=100,))])

parameters_hbc = {}
parameters_hbc['hbc__min_samples_leaf'] = [1, 5, 10]
parameters_hbc['hbc__max_depth'] = [3, 5, 10, 15, 20]

gs_hbc=GridSearchCV(pipeline_hbc,parameters_hbc)

gs_hbc.fit(X_train,y_train.seasonal_vaccine)

gs_hbc.best_params_

{'hbc__max_depth': 10, 'hbc__min_samples_leaf': 10}

In [351]:
model_hbc_tuned = {'Hist_Boosting':
              {'regressor': HistGradientBoostingClassifier(max_depth=10, min_samples_leaf=10),
               'preprocessor': preprocessor}}
 
print(list(model_hbc_tuned.keys())[0])
X_train_processed = model_hbc_tuned['Hist_Boosting']['preprocessor'].fit_transform(X_train)
X_test_processed = model_hbc_tuned['Hist_Boosting']['preprocessor'].transform(X_test)
model_hbc_tuned['Hist_Boosting']['fit_regressor'] = model_hbc_tuned['Hist_Boosting']['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
# model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
print(model_hbc_tuned)
precision_dict_tuned['Hist_Boosting'], accuracy_dict_tuned['Hist_Boosting'] = model_predictions(model_hbc_tuned['Hist_Boosting']['fit_regressor'], X_train_processed, y_train.seasonal_vaccine, X_test_processed, y_test.seasonal_vaccine)


Hist_Boosting
{'Hist_Boosting': {'regressor': HistGradientBoostingClassifier(max_depth=10, min_samples_leaf=10), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_touch_face', 'doctor_recc_h1n1'...
                                  'opinion_h1n1_vacc_effective',
                     

In [352]:
precision_dict

{'Logistic': 0.7753647777400746,
 'Naive_Bayes': 0.6257470119521913,
 'Decision_Trees': 0.6565428109854604,
 'KNN': 0.7050599201065246,
 'Random_Forest': 0.7763741891430522,
 'G_Boost': 0.7829744279946165,
 'Hist_Boost': 0.7797476986021139}

In [353]:
accuracy_dict

{'Logistic': 0.7769956567320653,
 'Naive_Bayes': 0.6851879586640707,
 'Decision_Trees': 0.679047476411562,
 'KNN': 0.7184364235435076,
 'Random_Forest': 0.7763965852927962,
 'G_Boost': 0.785831960461285,
 'Hist_Boost': 0.7796914782087764}

In [354]:
precision_dict_tuned

{'Hist_Boost': 0.7832494608195543,
 'Random_Forest': 0.7846374730796841,
 'Gradient_Boosting': 0.7825210084033614,
 'Hist_Boosting': 0.7789543348775645}

In [355]:
accuracy_dict_tuned

{'Hist_Boost': 0.769956567320653,
 'Random_Forest': 0.7714542459188258,
 'Gradient_Boosting': 0.7856821926014678,
 'Hist_Boosting': 0.7864310319005542}