In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve

In [3]:
ML_data = pd.read_csv('data/ML_ready_train')

In [4]:
ML_data['household_total'] = ML_data['household_children'] + ML_data['household_adults']

In [5]:
X = ML_data.drop(columns=['respondent_id','seasonal_vaccine', 'h1n1_vaccine'])
yh = ML_data.h1n1_vaccine
ys = ML_data.seasonal_vaccine


## H1N1 Train/Test Split:

In [6]:
Xh_train, Xh_test, yh_train, yh_test = train_test_split(X, yh, test_size=0.3, random_state=42)

## Seasonal Train/Test Split:

In [7]:
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X, ys, test_size=0.3, random_state=42)

# Random Forest Classifier (Best Option):

In [8]:
rfc = RandomForestClassifier()
rfc.fit(Xs_train, ys_train)

ys_pred_rfc = rfc.predict(Xs_test)

cnf_matrix = confusion_matrix(ys_test, ys_pred_rfc)
print(cnf_matrix)
Accuracy_rfc = rfc.score(Xs_test,ys_test)
print(Accuracy_rfc)

[[3568  774]
 [ 945 2726]]
0.7854736053912392


In [9]:
rfc2 = RandomForestClassifier()
rfc2.fit(Xh_train, yh_train)

yh_pred_rfc = rfc2.predict(Xh_test)

cnf_matrix2 = confusion_matrix(yh_test, yh_pred_rfc)
print(cnf_matrix2)
Accuracy_rfc2 = rfc2.score(Xh_test,yh_test)
print(Accuracy_rfc2)

[[6089  230]
 [ 965  729]]
0.8508673405715712


In [10]:
print(classification_report(yh_test, yh_pred_rfc))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      6319
           1       0.76      0.43      0.55      1694

    accuracy                           0.85      8013
   macro avg       0.81      0.70      0.73      8013
weighted avg       0.84      0.85      0.83      8013



In [11]:
print(classification_report(ys_test, ys_pred_rfc))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81      4342
           1       0.78      0.74      0.76      3671

    accuracy                           0.79      8013
   macro avg       0.78      0.78      0.78      8013
weighted avg       0.79      0.79      0.78      8013



In [12]:
roc_auc_score(yh_test, yh_pred_rfc)

0.6969721103106707

In [13]:
roc_auc_score(ys_test, ys_pred_rfc)

0.7821590438133436

In [15]:
importances = list(rfc.feature_importances_)

tab=pd.DataFrame(list(zip(X,importances)),columns =['Features', 'Importance scores']) 
imp = tab.sort_values(by='Importance scores', ascending=False)
imp.head(10)

Unnamed: 0,Features,Importance scores
17,opinion_seas_vacc_effective,0.086996
18,opinion_seas_risk,0.08486
10,doctor_recc_seasonal,0.06271
20,age_group,0.05572
15,opinion_h1n1_risk,0.034237
14,opinion_h1n1_vacc_effective,0.03153
19,opinion_seas_sick_from_vacc,0.028256
0,h1n1_concern,0.027309
21,education,0.026633
93,household_total,0.026233


In [23]:
importances = list(rfc2.feature_importances_)

tab=pd.DataFrame(list(zip(X,importances)),columns =['Features', 'Importance scores']) 
imp = tab.sort_values(by='Importance scores', ascending=False)
imp.head(10)

Unnamed: 0,Features,Importance scores
9,doctor_recc_h1n1,0.084293
15,opinion_h1n1_risk,0.069485
14,opinion_h1n1_vacc_effective,0.054149
18,opinion_seas_risk,0.042579
32,health_insurance_unknown,0.032333
31,health_insurance_1.0,0.031103
20,age_group,0.030636
17,opinion_seas_vacc_effective,0.027349
16,opinion_h1n1_sick_from_vacc,0.027081
0,h1n1_concern,0.026932


In [28]:
X2 = data.drop(columns=['respondent_id', 'seasonal_vaccine'])
y2s = data.seasonal_vaccine


X2s_train, X2s_test, y2s_train, y2s_test = train_test_split(X2, y2s, test_size=0.2, random_state=42)


RFC = RandomForestClassifier()
RFC.fit(X2s_train, y2s_train)

y2_preds = RFC.predict(X2s_test)

RFC.score(X2s_test, y2s_test)

0.8038187944590042

In [31]:
importances = list(RFC.feature_importances_)

tab=pd.DataFrame(list(zip(X2,importances)),columns =['Features', 'Importance scores']) 
imp = tab.sort_values(by='Importance scores', ascending=False)
imp.head(10)

Unnamed: 0,Features,Importance scores
35,h1n1_vaccine,0.0863
18,opinion_seas_vacc_effective,0.083255
19,opinion_seas_risk,0.082435
10,doctor_recc_seasonal,0.069074
21,age_group,0.059735
29,hhs_geo_region,0.047873
33,employment_industry,0.033284
34,employment_occupation,0.033149
16,opinion_h1n1_risk,0.031974
15,opinion_h1n1_vacc_effective,0.03037


In [32]:
roc_auc_score(y2s_test, y2_preds)

0.8014852436005833

# Logistic Regression:

In [64]:
Logreg = LogisticRegression(C = 10,random_state = 42, max_iter=1000)
Logreg.fit(Xs_train,ys_train)
    
ys_pred_lr = Logreg.predict(Xs_test)

print(confusion_matrix(ys_test, ys_pred_lr))
    
print(Logreg.score(Xs_test, ys_test))

[[3577  765]
 [ 937 2734]]
0.7875951578684638


In [65]:
Logreg2 = LogisticRegression(C = 10,random_state = 42, max_iter=1000)
Logreg2.fit(Xh_train,yh_train)
    
yh_pred_lr = Logreg2.predict(Xh_test)

print(confusion_matrix(yh_test, yh_pred_lr))
    
print(Logreg2.score(Xh_test, yh_test))

[[5991  328]
 [ 879  815]]
0.8493697741170598


In [18]:
print(classification_report(yh_test, yh_pred_lr))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4212
           1       0.72      0.49      0.58      1130

    accuracy                           0.85      5342
   macro avg       0.80      0.72      0.74      5342
weighted avg       0.84      0.85      0.84      5342



In [19]:
print(classification_report(ys_test, ys_pred_lr))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80      2891
           1       0.78      0.75      0.76      2451

    accuracy                           0.79      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.79      0.78      5342



In [66]:
roc_auc_score(yh_test, yh_pred_lr)

0.7146014259949146

In [67]:
roc_auc_score(ys_test, ys_pred_lr)

0.7842850539308618

# Support Vector Machine:

In [68]:
svm = SVC(kernel='linear')
svm.fit(Xs_train, ys_train)

ys_predict_svm=svm.predict(Xs_test)



cnf_matrix = confusion_matrix(ys_test, ys_predict_svm)
print(cnf_matrix)

Accuracy_svm=svm.score(Xs_test,ys_test)
print(Accuracy_svm)

[[3600  742]
 [ 977 2694]]
0.7854736053912392


In [69]:
svm2 = SVC(kernel='linear')
svm2.fit(Xh_train, yh_train)

yh_predict_svm=svm2.predict(Xh_test)



cnf_matrix2 = confusion_matrix(yh_test, yh_predict_svm)
print(cnf_matrix2)

Accuracy_svm2=svm2.score(Xh_test,yh_test)
print(Accuracy_svm2)

[[6024  295]
 [ 910  784]]
0.8496193685261451


In [24]:
print(classification_report(yh_test, yh_predict_svm))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4212
           1       0.73      0.47      0.57      1130

    accuracy                           0.85      5342
   macro avg       0.80      0.71      0.74      5342
weighted avg       0.84      0.85      0.84      5342



In [25]:
print(classification_report(ys_test, ys_predict_svm))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81      2891
           1       0.78      0.74      0.76      2451

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.78      5342
weighted avg       0.79      0.79      0.79      5342



In [71]:
roc_auc_score(yh_test, yh_predict_svm)

0.7080626576806928

In [70]:
roc_auc_score(ys_test, ys_predict_svm)

0.7814854962037034

# Gradient Boosting:

In [131]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
gbc = GradientBoostingClassifier(learning_rate=0.05 , n_estimators=200, random_state=5, max_depth=9, max_leaf_nodes=100)
gbc.fit(Xs_train, ys_train)


ys_predict_gbc = gbc.predict(Xs_test)


print(confusion_matrix(ys_test, ys_predict_gbc))

print(gbc.score(Xs_test,ys_test))


[[2367  524]
 [ 566 1885]]
0.7959565705728192


In [29]:
gbc2 = GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=160, random_state=5, max_depth=9, max_leaf_nodes=100)
gbc2.fit(Xh_train, yh_train)


yh_predict_gbc = gbc2.predict(Xh_test)


print(confusion_matrix(yh_test, yh_predict_gbc))

print(gbc2.score(Xh_test,yh_test))

[[3963  249]
 [ 554  576]]
0.8496817671284164


In [30]:
print(classification_report(yh_test, yh_predict_gbc))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4212
           1       0.70      0.51      0.59      1130

    accuracy                           0.85      5342
   macro avg       0.79      0.73      0.75      5342
weighted avg       0.84      0.85      0.84      5342



In [31]:
print(classification_report(ys_test, ys_predict_gbc))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      2891
           1       0.78      0.77      0.77      2451

    accuracy                           0.79      5342
   macro avg       0.79      0.79      0.79      5342
weighted avg       0.79      0.79      0.79      5342



In [32]:
roc_auc_score(yh_test, yh_predict_gbc)

0.7253088520787636

In [33]:
roc_auc_score(ys_test, ys_predict_gbc)

0.7899994510178819

# KNN:

In [34]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(p=2,weights='distance',n_neighbors=50)
knn.fit(Xs_train,ys_train)

ys_predict_knn=knn.predict(Xs_test)


print(confusion_matrix(ys_test, ys_predict_knn))

print(knn.score(Xs_test,ys_test))

[[2311  580]
 [ 612 1839]]
0.7768625982777986


In [35]:
knn2 = KNeighborsClassifier(p=2,weights='distance',n_neighbors=50)
knn2.fit(Xh_train,yh_train)


yh_predict_knn=knn2.predict(Xh_test)


print(confusion_matrix(yh_test, yh_predict_knn))

print(knn2.score(Xh_test,yh_test))

[[4093  119]
 [ 757  373]]
0.8360164732309996


In [36]:
print(classification_report(yh_test, yh_predict_knn))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      4212
           1       0.76      0.33      0.46      1130

    accuracy                           0.84      5342
   macro avg       0.80      0.65      0.68      5342
weighted avg       0.83      0.84      0.81      5342



In [37]:
print(classification_report(ys_test, ys_predict_knn))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      2891
           1       0.76      0.75      0.76      2451

    accuracy                           0.78      5342
   macro avg       0.78      0.77      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [38]:
roc_auc_score(yh_test, yh_predict_knn)

0.6509179419946381

In [39]:
roc_auc_score(ys_test, ys_predict_knn)

0.7748416878109459

# SGD:

In [40]:
from sklearn.linear_model import SGDClassifier

In [41]:
sgd = SGDClassifier()
sgd.fit(Xs_train, ys_train)

ys_pred_sgd = sgd.predict(Xs_test)

sgd.score(Xs_test, ys_test)

0.7821040808685885

In [71]:
roc_auc_score(ys_test, ys_pred_sgd)

0.7805536421153114

In [43]:
sgd2 = SGDClassifier()
sgd2.fit(Xh_train, yh_train)

yh_pred_sgd = sgd2.predict(Xh_test)

sgd2.score(Xh_test, yh_test)

0.8444402845376263

In [44]:
roc_auc_score(yh_test, yh_pred_sgd)

0.6698581381472237

In [45]:
print(classification_report(ys_test, ys_pred_sgd))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      2891
           1       0.76      0.76      0.76      2451

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [46]:
print(classification_report(yh_test, yh_pred_sgd))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      4212
           1       0.78      0.37      0.50      1130

    accuracy                           0.84      5342
   macro avg       0.82      0.67      0.70      5342
weighted avg       0.84      0.84      0.82      5342

