# H1N1 and Seasonal Flu Vaccines

In [None]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_roc_curve, confusion_matrix, plot_confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression

## Data Prep

In [None]:
# Load the survey features and the vaccine labels (both indexed by
# respondent_id), then attach the label columns to the feature rows.

features_df = pd.read_csv(
    "../data/Training_Features.csv",
    index_col="respondent_id",
)
labels_df = pd.read_csv(
    "../data/Training_Labels.csv",
    index_col="respondent_id",
)
df = features_df.join(labels_df)

In [None]:
# Columns removed from the feature matrix before training, grouped by the
# reason they are excluded.

cols_to_drop = [
    # high share of nulls
    'health_insurance',

    # demographic attributes
    'age_group', 'education', 'race', 'sex',
    'marital_status',
    'employment_status', 'employment_industry', 'employment_occupation',
    'income_poverty', 'rent_or_own',
    'hhs_geo_region', 'census_msa',
    'household_adults', 'household_children',

    # target labels
    'h1n1_vaccine', 'seasonal_vaccine',
]


In [None]:
# Features: everything left after removing the excluded columns.
# Target: the H1N1 vaccination label.
X = df.drop(columns=cols_to_drop)
y = df['h1n1_vaccine']

# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
is_object_col = X.dtypes == 'object'
num_cols = X.columns[~is_object_col].to_list()
cat_cols = X.columns[is_object_col].to_list()

### Preprocessing

In [None]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize each column.

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# No `cat_transformer` is defined here. If the demographic columns are kept for
# training, add a pipeline that one-hot-encodes the categorical variables.


In [None]:
# Route the numeric columns through the numeric pipeline. Only one transformer
# is registered; the categorical entry stays commented out until a
# `cat_transformer` exists.
preprocessor = ColumnTransformer(transformers=[
    ("numeric_transformer", num_transformer, num_cols),
    # ("categorical_transformer", cat_transformer, cat_cols)
])

## Dummy Model

Baseline accuracy score (always predicting the most frequent class): 79%

In [None]:
# Baseline: always predict the most frequent class seen during fitting.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Accuracy of the baseline on the training split.
dummy.score(X_train, y_train)

In [None]:
# Accuracy of the baseline on the held-out test split.
y_pred_dummy = dummy.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_dummy)

In [None]:
# Mean cross-validated accuracy of the baseline on the training split.
dummy_cv_accuracy = cross_val_score(dummy, X_train, y_train, scoring='accuracy')
dummy_a = dummy_cv_accuracy.mean()

## Decision Tree

In [None]:
# Decision tree with default hyperparameters; only the random seed is fixed.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Untuned decision-tree pipeline: preprocessing followed by the classifier.
# The step name 'classifier' is what the 'classifier__*' grid keys refer to.
dt_untuned = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', dt),
    ],
)

In [None]:
# Per-fold train and validation scores for the untuned tree (accuracy and precision).
cross_validate(
    dt_untuned,
    X_train,
    y_train,
    scoring=['accuracy', 'precision'],
    return_train_score=True,
)

In [None]:
# Mean cross-validated accuracy and precision for the untuned decision tree.
# One multi-metric cross_validate call scores both metrics on the same fitted
# folds, instead of refitting every fold once per metric.
dt1_cv = cross_validate(dt_untuned, X_train, y_train, scoring=['accuracy', 'precision'])
dt1_a = dt1_cv['test_accuracy'].mean()
dt1_p = dt1_cv['test_precision'].mean()

### Grid Search

In [None]:
# Decision-tree grid, first run.
# Keys use the 'classifier__' prefix to address the pipeline's classifier step.

dt_param_grid = {
    "classifier__class_weight": [
        'balanced',
        None,
    ],
    "classifier__max_depth": [2, 5, 15],
    "classifier__min_samples_leaf": [10, 15, 20],
    "classifier__min_samples_split": [2, 5, 7],
}

In [None]:
# Decision-tree grid, second run (rebinds dt_param_grid).

dt_param_grid = {
    "classifier__class_weight": [
        'balanced',
        None,
    ],
    "classifier__max_depth": [2, 5, 8, 13],
    "classifier__min_samples_leaf": [8, 10, 14],
    "classifier__min_samples_split": [2, 5, 10],
}

In [None]:
# Decision-tree grid, third run (rebinds dt_param_grid; when the cells run in
# order, this is the grid the search cell sees).

dt_param_grid = {
    "classifier__class_weight": [
        'balanced',
        None,
    ],
    "classifier__max_depth": [2, 3, 4],
    "classifier__min_samples_leaf": [5, 7, 9],
    "classifier__min_samples_split": [2, 4, 6],
}

In [None]:
# Grid search over dt_param_grid, selecting on precision.
# The estimator must be the dt_untuned pipeline: the grid keys are prefixed
# with 'classifier__', which addresses that pipeline's 'classifier' step.
# (The previous estimator name, dt_clf, is not defined anywhere in this notebook.)
dt_grid = GridSearchCV(dt_untuned, dt_param_grid, scoring="precision")
dt_grid.fit(X_train, y_train)
dt_grid.best_params_

### Tuned

In [None]:
# Decision tree with hand-entered hyperparameters.
# NOTE(review): presumably the best_params_ reported by the third grid-search
# run (every value appears in that grid) — confirm against the search output.
dt2 = DecisionTreeClassifier(
    class_weight=None,
    max_depth=2,
    min_samples_leaf=5,
    min_samples_split=2,
    random_state=42,
)

In [None]:
# Tuned decision-tree pipeline: same preprocessing, tuned classifier.
dt_tuned = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', dt2),
    ],
)

In [None]:
# Per-fold train and validation scores for the tuned tree (accuracy and precision).
cross_validate(
    dt_tuned,
    X_train,
    y_train,
    scoring=['accuracy', 'precision'],
    return_train_score=True,
)

In [None]:
# Mean cross-validated accuracy and precision for the tuned decision tree.
# One multi-metric cross_validate call scores both metrics on the same fitted
# folds, instead of refitting every fold once per metric.
dt2_cv = cross_validate(dt_tuned, X_train, y_train, scoring=['accuracy', 'precision'])
dt2_a = dt2_cv['test_accuracy'].mean()
dt2_p = dt2_cv['test_precision'].mean()

## Random Forest

In [None]:
# Random forest with default hyperparameters; only the random seed is fixed.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Untuned random-forest pipeline: preprocessing followed by the classifier.
# The step name 'classifier' is what the 'classifier__*' grid keys refer to.
rf_untuned = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf),
    ],
)

In [None]:
# Per-fold train and validation scores for the untuned forest (accuracy and precision).
cross_validate(
    rf_untuned,
    X_train,
    y_train,
    scoring=['accuracy', 'precision'],
    return_train_score=True,
)

In [None]:
# Mean cross-validated accuracy and precision for the untuned random forest.
# One multi-metric cross_validate call scores both metrics on the same fitted
# folds, instead of refitting every fold (a whole forest each) once per metric.
rf1_cv = cross_validate(rf_untuned, X_train, y_train, scoring=['accuracy', 'precision'])
rf1_a = rf1_cv['test_accuracy'].mean()
rf1_p = rf1_cv['test_precision'].mean()

### Grid Search

In [None]:
# Random-forest grid, first run - 13 min
# NOTE: this dict is bound to `param_grid`, while the search cell below reads
# `rf_param_grid`, so this first-run grid is not what the search uses.
# Trailing comments presumably record the best value found in this run.

param_grid = {
    "classifier__class_weight": [
        'balanced',
        None,
    ],  # none
    "classifier__max_depth": [2, 5, 8, 13],  # 13
    "classifier__min_samples_leaf": [8, 10, 14],  # 8
    "classifier__min_samples_split": [2, 5, 10],  # 2
    "classifier__n_estimators": [100, 200, 300],  # 200
}

In [None]:
# Random-forest grid, second run - 14 min
# Trailing comments presumably record the best value found in this run.

rf_param_grid = {
    "classifier__class_weight": [
        'balanced',
        None,
    ],  # none
    "classifier__max_depth": [8, 13, 17],  # 8
    "classifier__min_samples_leaf": [5, 8, 11],  # 8
    "classifier__min_samples_split": [2, 4, 6],  # 2
    "classifier__n_estimators": [150, 200, 250],  # 200
}

In [None]:
# Grid search over rf_param_grid on the random-forest pipeline, selecting on precision.
grid = GridSearchCV(
    estimator=rf_untuned,
    param_grid=rf_param_grid,
    scoring="precision",
)
grid.fit(X_train, y_train)
grid.best_params_

### Tuned

In [None]:
# Random forest with hand-entered hyperparameters.
# NOTE(review): presumably the best_params_ reported by the second grid-search
# run (every value appears in that grid) — confirm against the search output.
rf2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=8,
    min_samples_split=2,
    class_weight=None,
    random_state=42,
)

In [None]:
# Tuned random-forest pipeline: same preprocessing, tuned classifier.
rf_tuned = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf2),
    ],
)

In [None]:
# Per-fold train and validation scores for the tuned forest (accuracy and precision).
cross_validate(
    rf_tuned,
    X_train,
    y_train,
    scoring=['accuracy', 'precision'],
    return_train_score=True,
)

In [None]:
# Mean cross-validated accuracy and precision for the tuned random forest.
# One multi-metric cross_validate call scores both metrics on the same fitted
# folds, instead of refitting every fold (a whole forest each) once per metric.
rf2_cv = cross_validate(rf_tuned, X_train, y_train, scoring=['accuracy', 'precision'])
rf2_a = rf2_cv['test_accuracy'].mean()
rf2_p = rf2_cv['test_precision'].mean()

## Final Model

In [None]:
# The tuned random-forest pipeline is the final model. This is an alias, not a
# copy: fitting `final` also fits `rf_tuned`.
final = rf_tuned

In [None]:
# Fit the final model on the full training split and evaluate it once on the
# held-out test split; both metrics are reported as percentages.
final.fit(X_train, y_train)
y_pred_final = final.predict(X_test)

final_accuracy_pct = accuracy_score(y_test, y_pred_final) * 100
final_precision_pct = precision_score(y_test, y_pred_final) * 100
print("Accuracy:", final_accuracy_pct)
print("Precision:", final_precision_pct)

In [None]:
# Grouped bar chart comparing mean cross-validated accuracy and precision
# across all models.
models = [
    'Dummy',
    'Decision Tree',
    'Decision Tree (Tuned)',
    'Random Forest',
    'Random Forest (Tuned)',
]
accuracy_scores = [dummy_a, dt1_a, dt2_a, rf1_a, rf2_a]
# The dummy model's precision is entered as 0 rather than measured —
# presumably because the baseline never predicts the positive class; TODO confirm.
precision_scores = [0, dt1_p, dt2_p, rf1_p, rf2_p]

bar_width = 0.3
bar_positions1 = np.arange(len(models))       # left edge group: accuracy
bar_positions2 = bar_positions1 + bar_width   # shifted right: precision

fig, ax = plt.subplots(figsize=(10, 6))

for positions, scores, label in (
    (bar_positions1, accuracy_scores, 'Accuracy'),
    (bar_positions2, precision_scores, 'Precision'),
):
    ax.bar(positions, scores, bar_width, label=label)

ax.set_title('Model Performance')
# Centre each tick between the two bars of its group.
ax.set_xticks(bar_positions1 + bar_width / 2)
ax.set_xticklabels(models)
ax.legend();


In [None]:
# Confusion matrix of the final model on the test split, expressed as a
# percentage of all test samples (the four cells sum to 100).
cm = confusion_matrix(y_test, y_pred_final)
cm_percent = cm / np.sum(cm) * 100

# Plot heatmap for final model's confusion matrix for better visualization.
# Draw on the axes created here (ax=ax) instead of relying on pyplot's implicit
# "current axes", so the labels set below always target the heatmap.
fig, ax = plt.subplots(figsize=(8,7))
sns.heatmap(cm_percent, annot=True, fmt=".2f", cbar=False, cmap="Blues", ax=ax)

ax.set_title("Final Model Confusion Matrix (% of test set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.xaxis.set_ticklabels(['Not Vaxxed', 'Vaxxed'])
ax.yaxis.set_ticklabels(['Not Vaxxed', 'Vaxxed'], rotation = 0);