## Model Strategies

For the problem at hand, the most appropriate strategy was to fit a number of classification models to classify observations into vaccinated or un-vaccinated.

The following models were fit, tested, and tuned to produce the most accurate model possible based on F-1 score: neural network model, logistic regression, SVM, gradient boosting classifier, Gaussian Naive Bayes, KNN, Random Forest, and a boosted classifier.

## Loading Packages

In [30]:
#Standard data analytical libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt
import os, warnings, time, dmba
import scikitplot as skplt 

#Data Mining Book Libraries
from dmba import liftChart, gainsChart,regressionSummary, classificationSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection, adjusted_r2_score, AIC_score, BIC_score
from os.path import exists
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_curve, auc, roc_auc_score, plot_confusion_matrix,confusion_matrix,r2_score
#Classification 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import Perceptron, LogisticRegression,  LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import imblearn
from imblearn.over_sampling import SMOTE

# Used to save keystrokes when wanting to print something. Now we can just use
# p("Hello") instead of print("Hello")
p = print
# import csv
# import re

# Change this value if you are not using o_desktop
computer = 'o_desktop'
#computer = 'other'
if (computer == 'o_desktop'):
    os.environ['NUMEXPR_MAX_THREADS'] = '24'
else:
    # default is 4 or 8
    os.environ['NUMEXPR_MAX_THREADS'] = '8'

# For future use:
# import threading
# import multiprocessing

## Loading Data

In [2]:
# Setting directories and loading training set and training labels
repo_directory = r'C:/ADS_599_Final/'
data_folder_directory = r'C:/ADS_599_Final/Data_Folder/'
df_features_file = 'C:/ADS_599_Final/Data_Folder/training_set_features.csv'
df_labels_file = 'C:/ADS_599_Final/Data_Folder/training_set_labels.csv'
df = pd.read_csv(df_features_file)
df_labels = pd.read_csv(df_labels_file)

# Combining training data with training labels for modeling
df = df.join(df_labels.set_index('respondent_id'), on='respondent_id')

df.head(5)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


## Handling Null Values

In [3]:
# Renaming df
df_train = df

# Categories
df_train['h1n1_concern'] = df_train['h1n1_concern'].fillna(-1)
df_train['h1n1_knowledge'] = df_train['h1n1_knowledge'].fillna(-1)
df_train['behavioral_antiviral_meds'] = df_train['behavioral_antiviral_meds'].fillna(-1)
df_train['behavioral_avoidance'] = df_train['behavioral_avoidance'].fillna(-1)
df_train['behavioral_face_mask'] = df_train['behavioral_face_mask'].fillna(-1)
df_train['behavioral_large_gatherings'] = df_train['behavioral_large_gatherings'].fillna(-1)
df_train['behavioral_outside_home'] = df_train['behavioral_outside_home'].fillna(-1)
df_train['behavioral_wash_hands'] = df_train['behavioral_wash_hands'].fillna(-1)          
df_train['behavioral_touch_face'] = df_train['behavioral_touch_face'].fillna(-1)
df_train['doctor_recc_h1n1'] = df_train['doctor_recc_h1n1'].fillna(-1)
df_train['doctor_recc_seasonal'] = df_train['doctor_recc_seasonal'].fillna(-1)
df_train['chronic_med_condition'] = df_train['chronic_med_condition'].fillna(-1)
df_train['child_under_6_months'] = df_train['child_under_6_months'].fillna(-1)
df_train['health_worker'] = df_train['health_worker'].fillna(-1)
df_train['health_insurance'] = df_train['health_insurance'].fillna(-1)
df_train['opinion_h1n1_vacc_effective'] = df_train['opinion_h1n1_vacc_effective'].fillna(-1)
df_train['opinion_h1n1_sick_from_vacc'] = df_train['opinion_h1n1_sick_from_vacc'].fillna(-1)
df_train['opinion_h1n1_risk'] = df_train['opinion_h1n1_risk'].fillna(-1)
df_train['opinion_seas_vacc_effective'] = df_train['opinion_seas_vacc_effective'].fillna(-1)
df_train['opinion_seas_risk'] = df_train['opinion_seas_risk'].fillna(-1)
df_train['opinion_seas_sick_from_vacc'] = df_train['opinion_seas_sick_from_vacc'].fillna(-1)
df_train['household_adults'] = df_train['household_adults'].fillna(-1)
df_train['household_children'] = df_train['household_children'].fillna(-1)

# Numbers
df_train['age_group'] = df_train['age_group'].fillna("no_response")
df_train['education'] = df_train['education'].fillna("no_response")
df_train['race'] = df_train['race'].fillna("no_response")
df_train['income_poverty'] = df_train['income_poverty'].fillna("no_response")
df_train['marital_status'] = df_train['marital_status'].fillna("no_response")
df_train['rent_or_own'] = df_train['rent_or_own'].fillna("no_response")
df_train['employment_status'] = df_train['employment_status'].fillna("no_response")
df_train['employment_occupation'] = df_train['employment_occupation'].fillna("no_response")
df_train['employment_industry'] = df_train['employment_industry'].fillna("no_response")

In [4]:
# Label encoding
df_train_label = df_train
    # Encode labels the below is equivalent to df_train['hhs_geo_region']= label_encoder.fit_transform(df_train['hhs_geo_region'])
df_train_label["hhs_geo_region"] = df_train["hhs_geo_region"].astype('category')
df_train_label["hhs_geo_region"] = df_train["hhs_geo_region"].cat.codes
df_train_label["census_msa"] = df_train["census_msa"].astype('category')
df_train_label["census_msa"] = df_train["census_msa"].cat.codes
df_train_label["employment_industry"] = df_train["employment_industry"].astype('category')
df_train_label["employment_industry"] = df_train["employment_industry"].cat.codes
df_train_label["employment_occupation"] = df_train["employment_occupation"].astype('category')
df_train_label["employment_occupation"] = df_train["employment_occupation"].cat.codes
df_train_label["employment_status"] = df_train["employment_status"].astype('category')
df_train_label["employment_status"] = df_train["employment_status"].cat.codes
df_train_label["rent_or_own"] = df_train["rent_or_own"].astype('category')
df_train_label["rent_or_own"] = df_train["rent_or_own"].cat.codes
df_train_label["marital_status"] = df_train["marital_status"].astype('category')
df_train_label["marital_status"] = df_train["marital_status"].cat.codes
df_train_label["income_poverty"] = df_train["income_poverty"].astype('category')
df_train_label["income_poverty"] = df_train["income_poverty"].cat.codes
df_train_label["race"] = df_train["race"].astype('category')
df_train_label["race"] = df_train["race"].cat.codes
df_train_label["education"] = df_train["education"].astype('category')
df_train_label["education"] = df_train["education"].cat.codes
df_train_label["age_group"] = df_train["age_group"].astype('category')
df_train_label["age_group"] = df_train["age_group"].cat.codes
df_train_label["sex"] = df_train["sex"].astype('category')
df_train_label["sex"] = df_train["sex"].cat.codes

p("After encoding the null counts per column are: ")
p(df_train.isnull().sum())

After encoding the null counts per column are: 
respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                 

In [5]:
# Handling nulls three ways

handling_nulls = "median" # options "median" "iterative" "dropall"
if handling_nulls == "iterative":
    #Need to add back the NaN for the imputations.
    df_train.replace(-1, np.nan) 
    df_train.replace("no_response", np.nan) 
    
    # SMOTE Sampling
    temp_columns = df_train.columns
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df_train)
    df_train = pd.DataFrame(data=imp.transform(df_train))
    df_train.columns = temp_columns
    df_train
elif handling_nulls == "median":
    df_train_median = df_train
    #Need to add back the NaN for the imputations.
    df_train_median.replace(-1, np.nan) 
    df_train_median.replace("no_response", np.nan) 
    df_train_median.fillna(df_train.median())
elif handling_nulls == "dropall":
    df_train_drop = df_train
    #Need to add back the NaN for the imputations.
    df_train_drop.replace(-1, np.nan) 
    df_train_drop.replace("no_response", np.nan) 
    # See how it is if we drop the NaNs
    df_train_drop = df_train.dropna(inplace=False) #This should be replace with imputation.

In [6]:
handling_nulls = "dropall"
if handling_nulls == "iterative":
    #Need to add back the NaN for the imputations.
    df_train.replace(-1, np.nan) 
    df_train.replace("no_response", np.nan) 
    
    # SMOTE Sampling
    temp_columns = df_train.columns
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df_train)
    df_train = pd.DataFrame(data=imp.transform(df_train))
    df_train.columns = temp_columns
    df_train
elif handling_nulls == "median":
    df_train_median = df_train
    #Need to add back the NaN for the imputations.
    df_train_median.replace(-1, np.nan) 
    df_train_median.replace("no_response", np.nan) 
    df_train_median.fillna(df_train.median())
elif handling_nulls == "dropall":
    df_train_drop = df_train
    #Need to add back the NaN for the imputations.
    df_train_drop.replace(-1, np.nan) 
    df_train_drop.replace("no_response", np.nan) 
    # See how it is if we drop the NaNs
    df_train_drop = df_train.dropna(inplace=False) #This should be replace with imputation.
p("There should be no nulls now: ")
p(df_train_drop.isnull().sum())

There should be no nulls now: 
respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_s

In [7]:
# Respondent_id are all unique so its irrelevant now that we merged.
df_train = df_train.drop(columns=['respondent_id'], inplace=False)
df_train_label = df_train_label.drop(columns=['respondent_id'], inplace=False)
df_train_median = df_train_median.drop(columns=['respondent_id'], inplace=False)
df_train_drop = df_train_drop.drop(columns=['respondent_id'], inplace=False)

## Class Balancing

In [8]:
oversample = SMOTE()

# Separating the features and targets
# Original Data
X_h1n1 = df_train.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_seasonal = X_h1n1
y_h1n1 = df_train['h1n1_vaccine']
y_seasonal = df_train['seasonal_vaccine']
X_h1n1, y_h1n1 = oversample.fit_resample(X_h1n1, y_h1n1)
X_seasonal, y_seasonal = oversample.fit_resample(X_seasonal, y_seasonal)

# Encoded Data
X_label_h1n1 = df_train_label.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_label_seasonal = X_label_h1n1
y_label_h1n1 = df_train_label['h1n1_vaccine']
y_label_seasonal = df_train_label['seasonal_vaccine']
X_label_h1n1, y_label_h1n1 = oversample.fit_resample(X_label_h1n1, y_label_h1n1)
X_label_seasonal, y_label_seasonal = oversample.fit_resample(X_label_seasonal, y_label_seasonal)

# Nulls replaced with median data
X_median_h1n1 = df_train_median.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_median_seasonal = X_median_h1n1
y_median_h1n1 = df_train_median['h1n1_vaccine']
y_median_seasonal = df_train_median['seasonal_vaccine']
X_median_h1n1, y_median_h1n1 = oversample.fit_resample(X_median_h1n1, y_median_h1n1)
X_median_seasonal, y_median_seasonal = oversample.fit_resample(X_median_seasonal, y_median_seasonal)

# Nulls dropped data
X_drop_h1n1 = df_train_drop.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_drop_seasonal = X_drop_h1n1
y_drop_h1n1 = df_train_drop['h1n1_vaccine']
y_drop_seasonal = df_train_drop['seasonal_vaccine']
X_drop_h1n1, y_drop_h1n1 = oversample.fit_resample(X_drop_h1n1, y_drop_h1n1)
X_drop_seasonal, y_drop_seasonal = oversample.fit_resample(X_drop_seasonal, y_drop_seasonal)

## Preparing Train-Test-Validation Sets

In [9]:
# Splitting data into 70-20-10 train-test-validation sets

# Original Data
X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(X_h1n1, y_h1n1, train_size=.7)
X_test_h1n1, X_val_h1n1, y_test_h1n1, y_val_h1n1 = train_test_split(X_test_h1n1, y_test_h1n1, train_size=.67)

X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(X_seasonal, y_seasonal, train_size=.7)
X_test_seasonal, X_val_seasonal, y_test_seasonal, y_val_seasonal = train_test_split(X_test_seasonal, y_test_seasonal, train_size=.67)

# Encoded Data
X_train_label_h1n1, X_test_label_h1n1, y_train_label_h1n1, y_test_label_h1n1 = train_test_split(X_label_h1n1, y_label_h1n1, train_size=.7)
X_test_label_h1n1, X_val_label_h1n1, y_test_label_h1n1, y_val_label_h1n1 = train_test_split(X_test_label_h1n1, y_test_label_h1n1, train_size=.67)

X_train_label_seasonal, X_test_label_seasonal, y_train_label_seasonal, y_test_label_seasonal = train_test_split(X_label_seasonal, y_label_seasonal, train_size=.7)
X_test_label_seasonal, X_val_label_seasonal, y_test_label_seasonal, y_val_label_seasonal = train_test_split(X_test_label_seasonal, y_test_label_seasonal, train_size=.67)

# Nulls replaced with median data
X_train_median_h1n1, X_test_median_h1n1, y_train_median_h1n1, y_test_median_h1n1 = train_test_split(X_median_h1n1, y_median_h1n1, train_size=.7)
X_test_median_h1n1, X_val_median_h1n1, y_test_median_h1n1, y_val_median_h1n1 = train_test_split(X_test_median_h1n1, y_test_median_h1n1, train_size=.67)

X_train_median_seasonal, X_test_median_seasonal, y_train_median_seasonal, y_test_median_seasonal = train_test_split(X_median_seasonal, y_median_seasonal, train_size=.7)
X_test_median_seasonal, X_val_median_seasonal, y_test_median_seasonal, y_val_median_seasonal = train_test_split(X_test_median_seasonal, y_test_median_seasonal, train_size=.67)

# Nulls dropped data
X_train_drop_h1n1, X_test_drop_h1n1, y_train_drop_h1n1, y_test_drop_h1n1 = train_test_split(X_drop_h1n1, y_drop_h1n1, train_size=.7)
X_test_drop_h1n1, X_val_drop_h1n1, y_test_drop_h1n1, y_val_drop_h1n1 = train_test_split(X_test_drop_h1n1, y_test_drop_h1n1, train_size=.67)

X_train_drop_seasonal, X_test_drop_seasonal, y_train_drop_seasonal, y_test_drop_seasonal = train_test_split(X_drop_seasonal, y_drop_seasonal, train_size=.7)
X_test_drop_seasonal, X_val_drop_seasonal, y_test_drop_seasonal, y_val_drop_seasonal = train_test_split(X_test_drop_seasonal, y_test_drop_seasonal, train_size=.67)

## Running Models to Determine Best Handling of Nulls

In [11]:
# Neural Network Model
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter = 1000, random_state = 12345)

# Original Data
NN.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = NN.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nNeural Network (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

NN.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = NN.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nNeural Network (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
NN.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = NN.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nNeural Network - Encoded H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

NN.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = NN.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nNeural Network - Encoded Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
NN.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = NN.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nNeural Network - Median Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

NN.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = NN.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nNeural Network - Median Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
NN.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = NN.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nNeural Network - Drop Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

NN.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = NN.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nNeural Network - Drop Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


Neural Network (using scaler inputs) f1 score:  0.334
Confusion Matrix (Accuracy 0.5015)

       Prediction
Actual    0    1
     0 4240    0
     1 4215    0

Neural Network (using scaler inputs) f1 score:  0.332
Confusion Matrix (Accuracy 0.4964)

       Prediction
Actual    0    1
     0 2848    0
     1 2889    0

Neural Network - Encoded H1N1 (using scaler inputs) f1 score:  0.334
Confusion Matrix (Accuracy 0.5022)

       Prediction
Actual    0    1
     0 4246    0
     1 4209    0

Neural Network - Encoded Seasonal Flu (using scaler inputs) f1 score:  0.332
Confusion Matrix (Accuracy 0.4964)

       Prediction
Actual    0    1
     0    0 2889
     1    0 2848

Neural Network - Median Data H1N1 (using scaler inputs) f1 score:  0.332
Confusion Matrix (Accuracy 0.4973)

       Prediction
Actual    0    1
     0 4205    0
     1 4250    0

Neural Network - Median Data Seasonal Flu (using scaler inputs) f1 score:  0.331
Confusion Matrix (Accuracy 0.4942)

       Prediction
Actual 

Neural network models predict one of the classes for all predictions, leading to minimal differences in dataset performance. The original and encoded data do very slightly better.

In [19]:
# Logistic Regression Model
logistic = LogisticRegressionCV(cv=5, penalty = 'l2', solver = 'liblinear',tol=1e-5,max_iter=1000,Cs=10, random_state = 12345)

# Original Data
logistic.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = logistic.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nLogistic Regression H1N1 f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

logistic.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = logistic.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nLogistic Regression Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
logistic.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = logistic.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nLogistic Regression - Encoded H1N1 f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

logistic.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = logistic.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nLogistic Regression - Encoded Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
logistic.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = logistic.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nLogistic Regression - Median Data H1N1 f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

logistic.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = logistic.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nLogistic Regression - Median Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
logistic.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = logistic.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nLogistic Regression - Drop Data H1N1 f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

logistic.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = logistic.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nLogistic Regression - Drop Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


Logistic Regression H1N1 f1 score:  0.795
Confusion Matrix (Accuracy 0.7948)

       Prediction
Actual    0    1
     0 3386  854
     1  881 3334

Logistic Regression Seasonal Flu f1 score:  0.762
Confusion Matrix (Accuracy 0.7622)

       Prediction
Actual    0    1
     0 2184  664
     1  700 2189

Logistic Regression - Encoded H1N1 f1 score:  0.797
Confusion Matrix (Accuracy 0.7966)

       Prediction
Actual    0    1
     0 3360  886
     1  834 3375

Logistic Regression - Encoded Seasonal Flu f1 score:  0.763
Confusion Matrix (Accuracy 0.7629)

       Prediction
Actual    0    1
     0 2209  680
     1  680 2168

Logistic Regression - Median Data H1N1 f1 score:  0.794
Confusion Matrix (Accuracy 0.7942)

       Prediction
Actual    0    1
     0 3380  825
     1  915 3335

Logistic Regression - Median Data Seasonal Flu f1 score:  0.766
Confusion Matrix (Accuracy 0.7661)

       Prediction
Actual    0    1
     0 2227  675
     1  667 2168

Logistic Regression - Drop Data H1N1 f1

Dropping the data seems best in the logistic regression model. 

In [20]:
# Support Vector Machine Model
SVM = svm.LinearSVC(max_iter = 5000, penalty = 'l2', loss = 'hinge', random_state = 12345)

# Original Data
SVM.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = SVM.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nSVM H1N1 f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

SVM.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = SVM.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nSVM Seasonal f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
SVM.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = SVM.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nSVM - Encoded H1N1 f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

SVM.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = SVM.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nSVM - Encoded Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
SVM.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = SVM.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nSVM - Median Data H1N1 f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

SVM.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = SVM.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nSVM - Median Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
SVM.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = SVM.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nSVM - Drop Data H1N1 f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

SVM.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = SVM.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nSVM - Drop Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)




SVM H1N1 f1 score:  0.799
Confusion Matrix (Accuracy 0.7994)

       Prediction
Actual    0    1
     0 3427  813
     1  883 3332





SVM Seasonal f1 score:  0.765
Confusion Matrix (Accuracy 0.7654)

       Prediction
Actual    0    1
     0 2244  604
     1  742 2147





SVM - Encoded H1N1 f1 score:  0.796
Confusion Matrix (Accuracy 0.7960)

       Prediction
Actual    0    1
     0 3346  900
     1  825 3384





SVM - Encoded Seasonal Flu f1 score:  0.767
Confusion Matrix (Accuracy 0.7670)

       Prediction
Actual    0    1
     0 2238  651
     1  686 2162





SVM - Median Data H1N1 f1 score:  0.802
Confusion Matrix (Accuracy 0.8019)

       Prediction
Actual    0    1
     0 3455  750
     1  925 3325





SVM - Median Data Seasonal Flu f1 score:  0.768
Confusion Matrix (Accuracy 0.7685)

       Prediction
Actual    0    1
     0 2291  611
     1  717 2118





SVM - Drop Data H1N1 f1 score:  0.808
Confusion Matrix (Accuracy 0.8078)

       Prediction
Actual    0    1
     0 3465  781
     1  844 3365

SVM - Drop Data Seasonal Flu f1 score:  0.776
Confusion Matrix (Accuracy 0.7760)

       Prediction
Actual    0    1
     0 2261  561
     1  724 2191




Using the dropped datasets also works has the best scores for the SVM model.

In [21]:
# Gradient Boosting Classifier
gb_classif = GradientBoostingClassifier()

# Original Data
gb_classif.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = gb_classif.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nGradientBoosting (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

gb_classif.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = gb_classif.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nGradientBoosting (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
gb_classif.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = gb_classif.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nGradientBoosting - Encoded H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

gb_classif.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = gb_classif.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nGradientBoosting - Encoded Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
gb_classif.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = gb_classif.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nGradientBoosting - Median Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

gb_classif.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = gb_classif.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nGradientBoosting - Median Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
gb_classif.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = gb_classif.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nGradientBoosting - Drop Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

gb_classif.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = gb_classif.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nGradientBoosting - Drop Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


GradientBoosting (using scaler inputs) f1 score:  0.903
Confusion Matrix (Accuracy 0.9036)

       Prediction
Actual    0    1
     0 3969  271
     1  544 3671

GradientBoosting (using scaler inputs) f1 score:  0.795
Confusion Matrix (Accuracy 0.7947)

       Prediction
Actual    0    1
     0 2270  578
     1  600 2289

GradientBoosting - Encoded H1N1 (using scaler inputs) f1 score:  0.907
Confusion Matrix (Accuracy 0.9069)

       Prediction
Actual    0    1
     0 3985  261
     1  526 3683

GradientBoosting - Encoded Seasonal Flu (using scaler inputs) f1 score:  0.798
Confusion Matrix (Accuracy 0.7976)

       Prediction
Actual    0    1
     0 2324  565
     1  596 2252

GradientBoosting - Median Data H1N1 (using scaler inputs) f1 score:  0.903
Confusion Matrix (Accuracy 0.9036)

       Prediction
Actual    0    1
     0 3961  244
     1  571 3679

GradientBoosting - Median Data Seasonal Flu (using scaler inputs) f1 score:  0.788
Confusion Matrix (Accuracy 0.7877)

       Predic

The encoded data has best metrics for the H1N1 model, slightly lower than the dropped data model, while the seasonal flu model does best with the dropped data.

In [22]:
# Gaussian Naive Bayes model
gnb = GaussianNB()

# Original Data
gnb.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = gnb.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nNaive Bayes f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

gnb.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = gnb.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nNaive Bayes f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
gnb.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = gnb.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nNaive Bayes - Encoded H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

gnb.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = gnb.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nNaive Bayes - Encoded Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
gnb.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = gnb.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nNaive Bayes - Median Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

gnb.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = gnb.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nNaive Bayes - Median Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
gnb.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = gnb.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nNaive Bayes - Drop Data H1N1 (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

gnb.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = gnb.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nNaive Bayes - Drop Data Seasonal Flu (using scaler inputs) f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


Naive Bayes f1 score:  0.747
Confusion Matrix (Accuracy 0.7473)

       Prediction
Actual    0    1
     0 2969 1271
     1  866 3349

Naive Bayes f1 score:  0.726
Confusion Matrix (Accuracy 0.7260)

       Prediction
Actual    0    1
     0 1981  867
     1  705 2184

Naive Bayes - Encoded H1N1 (using scaler inputs) f1 score:  0.745
Confusion Matrix (Accuracy 0.7455)

       Prediction
Actual    0    1
     0 2958 1288
     1  864 3345

Naive Bayes - Encoded Seasonal Flu (using scaler inputs) f1 score:  0.732
Confusion Matrix (Accuracy 0.7324)

       Prediction
Actual    0    1
     0 2023  866
     1  669 2179

Naive Bayes - Median Data H1N1 (using scaler inputs) f1 score:  0.746
Confusion Matrix (Accuracy 0.7459)

       Prediction
Actual    0    1
     0 2980 1225
     1  923 3327

Naive Bayes - Median Data Seasonal Flu (using scaler inputs) f1 score:  0.725
Confusion Matrix (Accuracy 0.7248)

       Prediction
Actual    0    1
     0 2018  884
     1  695 2140

Naive Bayes - Dro

The seasonal flu model does best with the encoded data while the Naive Bayes model does best with the Dropped Data.

In [23]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=3, weights = 'distance')

# Original Data
knn.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = knn.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nKNN f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

knn.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = knn.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nKNN f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
knn.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = knn.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nKNN - Encoded H1N1 f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

knn.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = knn.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nKNN - Encoded Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
knn.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = knn.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nKNN - Median Data H1N1 f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

knn.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = knn.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nKNN - Median Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
knn.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = knn.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nKNN - Drop Data H1N1 f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

knn.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = knn.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nKNN - Drop Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


KNN f1 score:  0.816
Confusion Matrix (Accuracy 0.8202)

       Prediction
Actual    0    1
     0 2813 1427
     1   93 4122

KNN f1 score:  0.723
Confusion Matrix (Accuracy 0.7230)

       Prediction
Actual    0    1
     0 1962  886
     1  703 2186

KNN - Encoded H1N1 f1 score:  0.817
Confusion Matrix (Accuracy 0.8216)

       Prediction
Actual    0    1
     0 2814 1432
     1   76 4133

KNN - Encoded Seasonal Flu f1 score:  0.714
Confusion Matrix (Accuracy 0.7147)

       Prediction
Actual    0    1
     0 1917  972
     1  665 2183

KNN - Median Data H1N1 f1 score:  0.805
Confusion Matrix (Accuracy 0.8111)

       Prediction
Actual    0    1
     0 2693 1512
     1   85 4165

KNN - Median Data Seasonal Flu f1 score:  0.718
Confusion Matrix (Accuracy 0.7185)

       Prediction
Actual    0    1
     0 1961  941
     1  674 2161

KNN - Drop Data H1N1 f1 score:  0.818
Confusion Matrix (Accuracy 0.8216)

       Prediction
Actual    0    1
     0 2837 1409
     1   99 4110

KNN - Dro

The H1N1 model does best with the dropped data, while the seasonal flu model does best with the original data.

In [24]:
# Random Forest model
rf = RandomForestClassifier(max_depth=2, random_state = 12345)

# Original Data
rf.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = rf.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nRandom Forest f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

rf.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = rf.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nRandom Forest f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
rf.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = rf.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nRandom Forest - Encoded H1N1 f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

rf.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = rf.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nRandom Forest - Encoded Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
rf.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = rf.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nRandom Forest - Median Data H1N1 f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

rf.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = rf.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nRandom Forest - Median Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
rf.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = rf.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nRandom Forest - Drop Data H1N1 f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

rf.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = rf.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nRandom Forest - Drop Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


Random Forest f1 score:  0.825
Confusion Matrix (Accuracy 0.8251)

       Prediction
Actual    0    1
     0 3613  627
     1  852 3363

Random Forest f1 score:  0.75
Confusion Matrix (Accuracy 0.7506)

       Prediction
Actual    0    1
     0 2237  611
     1  820 2069

Random Forest - Encoded H1N1 f1 score:  0.828
Confusion Matrix (Accuracy 0.8283)

       Prediction
Actual    0    1
     0 3605  641
     1  811 3398

Random Forest - Encoded Seasonal Flu f1 score:  0.759
Confusion Matrix (Accuracy 0.7591)

       Prediction
Actual    0    1
     0 2280  609
     1  773 2075

Random Forest - Median Data H1N1 f1 score:  0.823
Confusion Matrix (Accuracy 0.8232)

       Prediction
Actual    0    1
     0 3594  611
     1  884 3366

Random Forest - Median Data Seasonal Flu f1 score:  0.744
Confusion Matrix (Accuracy 0.7439)

       Prediction
Actual    0    1
     0 2253  649
     1  820 2015

Random Forest - Drop Data H1N1 f1 score:  0.823
Confusion Matrix (Accuracy 0.8231)

       Pre

The H1N1 model does best with the encoded data, while the Seasonal Flu model does best with the dropped data.

In [26]:
# Boosted Classifier
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), learning_rate = 1.5, n_estimators=400, random_state = 12345)

# Original Data
adaboost.fit(X_train_h1n1, y_train_h1n1)
y_pred_h1n1 = adaboost.predict(X_test_h1n1)
f1 = round(f1_score(y_test_h1n1, y_pred_h1n1, average='macro'), 3)
print("\nAdaboost f1 score: ", f1)
classificationSummary(y_test_h1n1, y_pred_h1n1)

adaboost.fit(X_train_seasonal, y_train_seasonal)
y_pred_seasonal = adaboost.predict(X_test_seasonal)
f1 = round(f1_score(y_test_seasonal, y_pred_seasonal, average='macro'), 3)
print("\nAdaboost f1 score: ", f1)
classificationSummary(y_test_seasonal, y_pred_seasonal)

#Encoded Data
adaboost.fit(X_train_label_h1n1, y_train_label_h1n1)
y_pred_label_h1n1 = adaboost.predict(X_test_label_h1n1)
f1 = round(f1_score(y_test_label_h1n1, y_pred_label_h1n1, average='macro'), 3)
print("\nAdaboost - Encoded H1N1 f1 score: ", f1)
classificationSummary(y_test_label_h1n1, y_pred_label_h1n1)

adaboost.fit(X_train_label_seasonal, y_train_label_seasonal)
y_pred_label_seasonal = adaboost.predict(X_test_label_seasonal)
f1 = round(f1_score(y_test_label_seasonal, y_pred_label_seasonal, average='macro'), 3)
print("\nAdaboost - Encoded Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_label_seasonal, y_pred_label_seasonal)

# Median Data
adaboost.fit(X_train_median_h1n1, y_train_median_h1n1)
y_pred_median_h1n1 = adaboost.predict(X_test_median_h1n1)
f1 = round(f1_score(y_test_median_h1n1, y_pred_median_h1n1, average='macro'), 3)
print("\nAdaboost - Median Data H1N1 f1 score: ", f1)
classificationSummary(y_test_median_h1n1, y_pred_median_h1n1)

adaboost.fit(X_train_median_seasonal, y_train_median_seasonal)
y_pred_median_seasonal = adaboost.predict(X_test_median_seasonal)
f1 = round(f1_score(y_test_median_seasonal, y_pred_median_seasonal, average='macro'), 3)
print("\nAdaboost - Median Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_median_seasonal, y_pred_median_seasonal)

# Nulls Dropped Data
adaboost.fit(X_train_drop_h1n1, y_train_drop_h1n1)
y_pred_drop_h1n1 = adaboost.predict(X_test_drop_h1n1)
f1 = round(f1_score(y_test_drop_h1n1, y_pred_drop_h1n1, average='macro'), 3)
print("\nAdaboost - Drop Data H1N1 f1 score: ", f1)
classificationSummary(y_test_drop_h1n1, y_pred_drop_h1n1)

adaboost.fit(X_train_drop_seasonal, y_train_drop_seasonal)
y_pred_drop_seasonal = adaboost.predict(X_test_drop_seasonal)
f1 = round(f1_score(y_test_drop_seasonal, y_pred_drop_seasonal, average='macro'), 3)
print("\nAdaboost - Drop Data Seasonal Flu f1 score: ", f1)
classificationSummary(y_test_drop_seasonal, y_pred_drop_seasonal)


Adaboost f1 score:  0.898
Confusion Matrix (Accuracy 0.8980)

       Prediction
Actual    0    1
     0 3896  344
     1  518 3697

Adaboost f1 score:  0.791
Confusion Matrix (Accuracy 0.7910)

       Prediction
Actual    0    1
     0 2267  581
     1  618 2271

Adaboost - Encoded H1N1 f1 score:  0.897
Confusion Matrix (Accuracy 0.8969)

       Prediction
Actual    0    1
     0 3891  355
     1  517 3692

Adaboost - Encoded Seasonal Flu f1 score:  0.783
Confusion Matrix (Accuracy 0.7828)

       Prediction
Actual    0    1
     0 2293  596
     1  650 2198

Adaboost - Median Data H1N1 f1 score:  0.898
Confusion Matrix (Accuracy 0.8976)

       Prediction
Actual    0    1
     0 3883  322
     1  544 3706

Adaboost - Median Data Seasonal Flu f1 score:  0.768
Confusion Matrix (Accuracy 0.7680)

       Prediction
Actual    0    1
     0 2243  659
     1  672 2163

Adaboost - Drop Data H1N1 f1 score:  0.897
Confusion Matrix (Accuracy 0.8969)

       Prediction
Actual    0    1
     0 39

The H1N1 and Seasonal Flu models do best with the original data. 

## Normalization and Standardization

In [28]:
# Data Normalization with sklearn

# Fitting a scaler on the training datasets
normh1n1= MinMaxScaler().fit(X_train_h1n1)
normseasonal = MinMaxScaler().fit(X_train_seasonal)

# Transforming the training datasets
X_train_norm_h1n1 = normh1n1.transform(X_train_h1n1)
X_train_norm_seasonal = normseasonal.transform(X_train_seasonal)

# transform the testing dataset
X_test_norm_h1n1 = normh1n1.transform(X_test_h1n1)
X_test_norm_seasonal = normseasonal.transform(X_test_seasonal)

In [31]:
# Data standardization with sklearn

# Copy the two datasets
X_train_stand_h1n1 = X_train_h1n1.copy()
X_train_stand_seasonal = X_train_seasonal.copy()
X_test_stand_h1n1 = X_test_h1n1.copy()
X_test_stand_seasonal = X_test_seasonal.copy()

# Group the numerical features and not categorical

num_cols = ['h1n1_concern','h1n1_knowledge','behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_large_gatherings',
'behavioral_outside_home','behavioral_wash_hands','behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition',
'child_under_6_months','health_worker','health_insurance','opinion_h1n1_vacc_effective','opinion_h1n1_sick_from_vacc','opinion_h1n1_risk',
'opinion_seas_sick_from_vacc','household_adults','household_children']

# Apply standardization on the numerical features
for i in num_cols:
    
    # Fit the scaler on the training data column
    scale_h1n1 = StandardScaler().fit(X_train_stand_h1n1[[i]])
    scale_seasonal = StandardScaler().fit(X_train_stand_seasonal[[i]])
    
    # Transform the training data column
    X_train_stand_h1n1[i] = scale_h1n1.transform(X_train_stand_h1n1[[i]])
    X_train_stand_seasonal[i] = scale_seasonal.transform(X_train_stand_seasonal[[i]])
    
    # Transform the testing data column
    X_test_stand_h1n1[i] = scale_h1n1.transform(X_test_stand_h1n1[[i]])
    X_test_stand_seasonal[i] = scale_seasonal.transform(X_test_stand_seasonal[[i]])

## Training Normalized/Standardized Models

In [33]:
# Training a Neural Network model

# Neural Network Model
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter = 1000, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = NN.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = NN.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_svr = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_svr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.501478,0.334,0.496427,0.332
Normalized,0.8123,0.812,0.773575,0.774
Standardized,0.811591,0.812,0.772878,0.773


The normalized data performs best for the Neural Network Model.

In [34]:
# Logistic Regression Model
logistic = LogisticRegressionCV(cv=5, penalty = 'l2', solver = 'liblinear',tol=1e-5,max_iter=1000,Cs=10, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = logistic.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = logistic.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_logistic = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_logistic

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.794796,0.795,0.762245,0.762
Normalized,0.794796,0.795,0.762594,0.763
Standardized,0.794914,0.795,0.762594,0.763


The accuracy for the standardized H1N1 model ahs higher accuracy than the others, but otherwise the normalized and standardized data have similar results.

In [35]:
# Support Vector Machine Model
SVM = svm.LinearSVC(max_iter = 5000, penalty = 'l2', loss = 'hinge', random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = SVM.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = SVM.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_SVM = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_SVM



Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.799409,0.799,0.765383,0.765
Normalized,0.800473,0.8,0.765731,0.766
Standardized,0.800473,0.8,0.76608,0.766


The standardized data has slightly better accuracy for the flu model. 

In [36]:
# Gradient Boosting Classifier
gb_classif = GradientBoostingClassifier()

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = gb_classif.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = gb_classif.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_gb_classif = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_gb_classif

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.903607,0.903,0.794666,0.795
Normalized,0.903607,0.903,0.794666,0.795
Standardized,0.903607,0.903,0.794666,0.795


The Gradient Boosting Classifier models all have similar metrics.

In [37]:
# Gaussian Naive Bayes model
gnb = GaussianNB()

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = gnb.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = gnb.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_gnb = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_gnb

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.74725,0.747,0.725989,0.726
Normalized,0.74725,0.747,0.725989,0.726
Standardized,0.74725,0.747,0.725989,0.726


The Naive Bayes models don't have a statistical difference between them.

In [38]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5, weights = 'distance')

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = knn.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = knn.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_knn = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_knn

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.808989,0.804,0.729301,0.729
Normalized,0.824128,0.821,0.720934,0.721
Standardized,0.815494,0.811,0.736273,0.736


The normalized data has higher metrics for the H1N1 models, but the standardized data has higher metrics for the seasonal flu models.

In [39]:
# Random Forest model
rf = RandomForestClassifier(max_depth=2, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = rf.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = rf.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_rf = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_rf

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.825074,0.825,0.750566,0.75
Normalized,0.825074,0.825,0.750566,0.75
Standardized,0.825074,0.825,0.750566,0.75


The Random Forest models don't have a statistical difference between them.

In [40]:
# Boosted Classifier
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), learning_rate = 1.5, n_estimators=400, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Normalized and standardized training and testing data
trainX_h1n1 = [X_train_h1n1, X_train_norm_h1n1, X_train_stand_h1n1]
testX_h1n1 = [X_test_h1n1, X_test_norm_h1n1, X_test_stand_h1n1]

trainX_seasonal = [X_train_seasonal, X_train_norm_seasonal, X_train_stand_seasonal]
testX_seasonal = [X_test_seasonal, X_test_norm_seasonal, X_test_stand_seasonal]

for i in range(len(trainX_h1n1)):   
    # model fitting
    model = adaboost.fit(trainX_h1n1[i],y_train_h1n1)    
    # model prediction
    pred = model.predict(testX_h1n1[i])
    # measuring RMSE
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # model fitting
    model = adaboost.fit(trainX_seasonal[i],y_train_seasonal)    
    # model prediction
    pred = model.predict(testX_seasonal[i])
    # measuring RMSE
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# results    
df_adaboost = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Normalized','Standardized'])
df_adaboost

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.898048,0.898,0.791006,0.791
Normalized,0.898048,0.898,0.791006,0.791
Standardized,0.897694,0.898,0.791006,0.791


The Boosted Classifier models have similar metrics, but the standardized H1N1 model has lower accuracy than the others.

## Refining Models

### Combining optimal handling of null values, normalization/standardization, and running them on full datasets or only highly correlated features.

In [41]:
# Setting highly correlated variables
flu_correlated = ['h1n1_concern', 'h1n1_knowledge', 'behavior_wash_hands', 'behavior_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment occupation']
h1n1_correlated = ['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment occupation']

In [71]:
# Normalizing the dropped H1N1 data
X_train_drop_h1n1_corr = X_train_drop_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_test_drop_h1n1_corr = X_test_drop_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)

# Fitting a scaler on the training datasets
normh1n1_drop = MinMaxScaler().fit(X_train_drop_h1n1)
normh1n1_drop_corr = MinMaxScaler().fit(X_train_drop_h1n1_corr)

# Transforming the training datasets
X_train_drop_norm_h1n1 = normh1n1_drop.transform(X_train_drop_h1n1)
X_train_drop_norm_h1n1_corr = normh1n1_drop_corr.transform(X_train_drop_h1n1_corr)


# transform the testing dataset
X_test_drop_norm_h1n1 = normh1n1_drop.transform(X_test_drop_h1n1)
X_test_drop_norm_h1n1_corr = normh1n1_drop_corr.transform(X_test_drop_h1n1_corr)

In [43]:
# Standardizing the dropped H1N1 and Seasonal Data

# Copy the two datasets
X_train_drop_stand_h1n1 = X_train_drop_h1n1.copy()
X_train_drop_stand_seasonal = X_train_drop_seasonal.copy()
X_test_drop_stand_h1n1 = X_test_drop_h1n1.copy()
X_test_drop_stand_seasonal = X_test_drop_seasonal.copy()

# Apply standardization on the numerical features
for i in num_cols:
    
    # Fit the scaler on the training data column
    scale_drop_h1n1 = StandardScaler().fit(X_train_drop_stand_h1n1[[i]])
    scale_drop_seasonal = StandardScaler().fit(X_train_drop_stand_seasonal[[i]])
    
    # Transform the training data column
    X_train_drop_stand_h1n1[i] = scale_drop_h1n1.transform(X_train_drop_stand_h1n1[[i]])
    X_train_drop_stand_seasonal[i] = scale_drop_seasonal.transform(X_train_drop_stand_seasonal[[i]])
    
    # Transform the testing data column
    X_test_drop_stand_h1n1[i] = scale_drop_h1n1.transform(X_test_drop_stand_h1n1[[i]])
    X_test_drop_stand_seasonal[i] = scale_drop_seasonal.transform(X_test_drop_stand_seasonal[[i]])

In [80]:
# Creating correlated datasets
X_train_h1n1_corr = X_train_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_train_seasonal_corr = X_train_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)
X_test_h1n1_corr = X_test_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_test_seasonal_corr = X_test_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)

X_train_drop_stand_h1n1_corr = X_train_drop_stand_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_train_drop_stand_seasonal_corr = X_train_drop_stand_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)
X_test_drop_stand_h1n1_corr = X_test_drop_stand_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_test_drop_stand_seasonal_corr = X_test_drop_stand_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)

X_train_label_h1n1_corr = X_train_label_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_train_label_seasonal_corr = X_train_label_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)
X_test_label_h1n1_corr = X_test_label_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_test_label_seasonal_corr = X_test_label_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)

X_train_drop_h1n1_corr = X_train_drop_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_train_drop_seasonal_corr = X_train_drop_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)
X_test_drop_h1n1_corr = X_test_drop_h1n1.filter(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'employment_occupation'], axis=1)
X_test_drop_seasonal_corr = X_test_drop_seasonal.filter(['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'household_children', 'employment_occupation'], axis=1)

In [74]:
# Logistic Regression Model
logistic = LogisticRegressionCV(cv=5, penalty = 'l2', solver = 'liblinear',tol=1e-5,max_iter=1000,Cs=10, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Splitting data into full and correlated datasets
trainX_h1n1 = [X_train_drop_stand_h1n1, X_train_drop_stand_h1n1_corr]
testX_h1n1 = [X_test_drop_stand_h1n1, X_test_drop_stand_h1n1_corr]

trainX_seasonal = [X_train_drop_stand_seasonal, X_train_drop_stand_seasonal_corr]
testX_seasonal = [X_test_drop_stand_seasonal, X_test_drop_stand_seasonal_corr]

for i in range(len(trainX_h1n1)):   
    # Model fitting
    model = logistic.fit(trainX_h1n1[i],y_train_h1n1)    
    # Model prediction
    pred = model.predict(testX_h1n1[i])
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # Model fitting
    model = logistic.fit(trainX_seasonal[i],y_train_seasonal)    
    # Model prediction
    pred = model.predict(testX_seasonal[i])
    # Metrics
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# Results    
df_logistic = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Highly Correlated'])
df_logistic

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.498522,0.498,0.496078,0.482
Highly Correlated,0.512005,0.512,0.495904,0.44


The highly correlated models do better for H1N1, but not for the Seasonal Flu.

In [75]:
# Support Vector Machine Model
SVM = svm.LinearSVC(max_iter = 5000, penalty = 'l2', loss = 'hinge', random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Splitting data into full and correlated datasets
trainX_h1n1 = [X_train_drop_norm_h1n1, X_train_drop_norm_h1n1_corr]
testX_h1n1 = [X_test_drop_norm_h1n1, X_test_drop_norm_h1n1_corr]

trainX_seasonal = [X_train_drop_stand_seasonal, X_train_drop_stand_seasonal_corr]
testX_seasonal = [X_test_drop_stand_seasonal, X_test_drop_stand_seasonal_corr]

for i in range(len(trainX_h1n1)):   
    # Model fitting
    model = SVM.fit(trainX_h1n1[i],y_train_h1n1)    
    # Model prediction
    pred = model.predict(testX_h1n1[i])
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # Model fitting
    model = SVM.fit(trainX_seasonal[i],y_train_seasonal)    
    # Model prediction
    pred = model.predict(testX_seasonal[i])
    # Metrics
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# Results    
df_SVM = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Highly Correlated'])
df_SVM



Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.498167,0.498,0.490326,0.476
Highly Correlated,0.504317,0.47,0.493289,0.431


The original and highly correlated datasets have mixed metrics.

In [78]:
# Gradient Boosting Classifier
gb_classif = GradientBoostingClassifier()

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Splitting data into full and correlated datasets
trainX_h1n1 = [X_train_label_h1n1, X_train_label_h1n1_corr]
testX_h1n1 = [X_test_label_h1n1, X_test_label_h1n1_corr]

trainX_seasonal = [X_train_seasonal, X_train_seasonal_corr]
testX_seasonal = [X_test_seasonal, X_test_seasonal_corr]

for i in range(len(trainX_h1n1)):   
    # Model fitting
    model = gb_classif.fit(trainX_h1n1[i],y_train_h1n1)    
    # Model prediction
    pred = model.predict(testX_h1n1[i])
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # Model fitting
    model = gb_classif.fit(trainX_seasonal[i],y_train_seasonal)    
    # Model prediction
    pred = model.predict(testX_seasonal[i])
    # Metrics
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# Results    
df_gb_classif = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Highly Correlated'])
df_gb_classif

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.497339,0.497,0.794666,0.795
Highly Correlated,0.500059,0.499,0.66533,0.665


Using the correlated features, the H1N1 has improved metrics while the Seasonal Flu models do better with the original data.

In [81]:
# Random Forest model
rf = RandomForestClassifier(max_depth=2, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Splitting data into full and correlated datasets
trainX_h1n1 = [X_train_label_h1n1, X_train_label_h1n1_corr]
testX_h1n1 = [X_test_label_h1n1, X_test_label_h1n1_corr]

trainX_seasonal = [X_train_drop_seasonal, X_train_drop_seasonal_corr]
testX_seasonal = [X_test_drop_seasonal, X_test_drop_seasonal_corr]

for i in range(len(trainX_h1n1)):   
    # Model fitting
    model = rf.fit(trainX_h1n1[i],y_train_h1n1)    
    # Model prediction
    pred = model.predict(testX_h1n1[i])
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # Model fitting
    model = rf.fit(trainX_seasonal[i],y_train_seasonal)    
    # Model prediction
    pred = model.predict(testX_seasonal[i])
    # Metrics
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# Results    
df_rf = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Highly Correlated'])
df_rf

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.501597,0.502,0.499216,0.459
Highly Correlated,0.503134,0.498,0.503225,0.469


The seasonal flu model does better with the correlated dataset, which is the opposite of most of the models, and the H1N1 doesn't have a significant difference with the highly correlated dataset. 

In [82]:
# Boosted Classifier
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), learning_rate = 1.5, n_estimators=400, random_state = 12345)

acc_h1n1 = []
f1_h1n1 = []
acc_seasonal = []
f1_seasonal = []

# Splitting data into full and correlated datasets
trainX_h1n1 = [X_train_h1n1, X_train_h1n1_corr]
testX_h1n1 = [X_test_h1n1, X_test_h1n1_corr]

trainX_seasonal = [X_train_seasonal, X_train_seasonal_corr]
testX_seasonal = [X_test_seasonal, X_test_seasonal_corr]

for i in range(len(trainX_h1n1)):   
    # Model fitting
    model = adaboost.fit(trainX_h1n1[i],y_train_h1n1)    
    # Model prediction
    pred = model.predict(testX_h1n1[i])
    acc_h1n1.append(accuracy_score(y_test_h1n1,pred))
    f1_h1n1.append(round(f1_score(y_test_h1n1, pred, average='macro'), 3))
    
for i in range(len(trainX_seasonal)):   
    # Model fitting
    model = adaboost.fit(trainX_seasonal[i],y_train_seasonal)    
    # Model prediction
    pred = model.predict(testX_seasonal[i])
    # Metrics
    acc_seasonal.append(accuracy_score(y_test_seasonal,pred))
    f1_seasonal.append(round(f1_score(y_test_seasonal, pred, average='macro'), 3))
    

# Results    
df_adaboost = pd.DataFrame({'Acc H1N1':acc_h1n1, 'F1 H1N1':f1_h1n1, 'Acc Flu':acc_seasonal, 'F1 Flu':f1_seasonal},index=['Original','Highly Correlated'])
df_adaboost

Unnamed: 0,Acc H1N1,F1 H1N1,Acc Flu,F1 Flu
Original,0.898048,0.898,0.791006,0.791
Highly Correlated,0.85618,0.856,0.662018,0.662


The metrics are better with the original data.

Overall, combining null values and normalization/standardization types lowers the metrics for the models. The top two H1N1 models are the Encoded Gradient Boosted Classifier model (Accuracy: 0.9069) and the Gradient Boosted Classifier model without adjustments (Accuracy: 0.904). The top two Seasonal Flu models are the Dropped NA Gradient Boosted Classifier Model (Accuracy: 0.8055) and the Gradient Boosted Classifier model without adjustments (Accuracy: 0.7951). 

In [None]:
# Find best paramters of Adaboost TODO using random search instead of grid search
boost_grid = {
    'n_estimators': [50, 100, 200, 400],
    'learning_rate': [0.01, 0.1, 0.5, 1.0, 1.5, 2.0]
}

boost_gridSearch = GridSearchCV(AdaBoostClassifier(), boost_grid, cv = 3)
boost_gridSearch.fit(X_train, y_train_h1n1)

print('Initial Adaboost Parameters:', boost_gridSearch.best_params_)

In [None]:
# Create plot of the confusion matrix, showing performance of the model
cm = confusion_matrix(y_test_h1n1, y_pred)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')