In [191]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss, confusion_matrix, plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder

# from ipynb.fs.full.Functions import factorial

RANDOM_SEED = 1235846    # Set a random seed for reproducibility!

In [2]:
def model_predictions(model, x_train, y_train, x_test=None, y_test=None):

    '''Enter model name and test/train sets to generate predictions, precision, recall, accuracy, and F1 score'''
    model.fit(x_train, y_train)
    y_hat_train = model.predict(x_train)
    if x_test:
        y_hat_test = model.predict(x_test)
    print('Training Precision: ', precision_score(y_train, y_hat_train))
    if x_test:
        print('Testing Precision: ', precision_score(y_test, y_hat_test))
    print('-----')

    print('Training Recall: ', recall_score(y_train, y_hat_train))
    if x_test:
        print('Testing Recall: ', recall_score(y_test, y_hat_test))
    print('-----')

    print('Training Accuracy: ', accuracy_score(y_train, y_hat_train))
    if x_test:
        print('Testing Accuracy: ', accuracy_score(y_test, y_hat_test))
    print('-----')

    print('Training F1-Score: ', f1_score(y_train, y_hat_train))
    if x_test:
        print('Testing F1-Score: ', f1_score(y_test, y_hat_test))

In [182]:
features_df = pd.read_csv('./data/flu_training_set_features.csv', index_col='respondent_id')
labels_df = labels_df = pd.read_csv("./data/flu_training_set_labels.csv", index_col="respondent_id")

In [183]:
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    labels_df,
    test_size=0.25,
    random_state=RANDOM_SEED
)

In [184]:
drop_features = ['hhs_geo_region', 'employment_industry','employment_occupation']
X_train.drop(drop_features, axis=1, inplace=True)
X_test.drop(drop_features, axis=1, inplace=True)
numeric_features = [col for col in X_train.columns if X_train[col].dtype != 'O']
categorical_features = [col for col in X_train.columns if X_train[col].dtype == 'O']

In [192]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median", add_indicator=True)), ("scaler", StandardScaler())]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [237]:
models = {'Baseline':
              {'regressor': DummyRegressor(),
               'preprocessor': preprocessor},
          'Logistic':
              {'regressor': LogisticRegression(),
               'preprocessor': preprocessor},
          'Naive_Bayes':
              {'regressor': GaussianNB(),
               'preprocessor': preprocessor},
          'Random_Forest':
              {'regressor': RandomForestClassifier(),
               'preprocessor': preprocessor}
              }

In [238]:
count = 0
for name, model in models.items():  
    print(name)
    X_train_processed = model['preprocessor'].fit_transform(X_train)
    X_test_processed = model['preprocessor'].transform(X_test)
    model['fit_regressor'] = model['regressor'].fit(X_train_processed, y_train.seasonal_vaccine)
    model['output'] = model['fit_regressor'].score(X_test_processed, y_test.seasonal_vaccine)
    print(model)
    if count > 0:
        model_predictions(model['fit_regressor'], X_train_processed, y_train.seasonal_vaccine)
    count += 1

Baseline
{'regressor': DummyRegressor(), 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['h1n1_concern', 'h1n1_knowledge',
                                  'behavioral_antiviral_meds',
                                  'behavioral_avoidance',
                                  'behavioral_face_mask',
                                  'behavioral_wash_hands',
                                  'behavioral_large_gatherings',
                                  'behavioral_outside_home',
                                  'behavioral_to...
                                  'opinion_h1n1_vacc_effective',
                                  'opinion_