In [62]:
import pandas as pd
from utils.functions import *
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [63]:
df = pd.read_csv("data/training_h1n1.csv")
df = df.dropna()

In [64]:
print(df.columns)
print(df.dtypes)
print(df.describe())
print(df.shape)

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face',
       ...
       'qxajmpny', 'rcertsgn', 'tfqavkke', 'ukymxvdu', 'uqqtjvyb', 'vlluhbov',
       'xgwztkwe', 'xqwwgdyp', 'xtkaffoo', 'xzmlyyjv'],
      dtype='object', length=104)
respondent_id                  int64
h1n1_concern                 float64
h1n1_knowledge               float64
behavioral_antiviral_meds    float64
behavioral_avoidance         float64
                              ...   
vlluhbov                       int64
xgwztkwe                       int64
xqwwgdyp                       int64
xtkaffoo                       int64
xzmlyyjv                       int64
Length: 104, dtype: object
       respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
count   13104.000000  13104.000000

In [65]:
# types = ["int64","float64"]
# categorical = []
# numerical = []
#
# for column in df.columns:
#     if seperate_columns_by_type(df[column], types):
#         numerical.append(column)
#     else:
#         categorical.append(column)

In [66]:
# for column in categorical[:-1]:
#     sns.catplot(kind="count", data=df, x=column, hue=categorical[-1])
#     plt.title(f"Countplot on {column}")
#     plt.xticks(rotation=45)
#     plt.show()

Splitting into train and test

In [81]:
target_column = "h1n1_vaccine"
y = df[target_column].values
X = df.drop([target_column,"respondent_id"], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                    random_state=1,stratify=y)

scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)

## Trying out different models for predictions
### Linear regression

In [82]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

degree=2
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
model.fit(X_train_standard, y_train)
model_score = model.score(X_train,y_train)
print(model_score)

-1.262155999393191e+21


### Random Forest # TROUBLE WITH 1's

In [83]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

model = RandomForestClassifier(random_state=40)
model.fit(X_train_standard, y_train)
y_pred = model.predict(X_test)
cv_score = cross_val_score(model,X_train, y_train,cv=100)
print("Model used is Random Forest:")
print(f"cv_score   : {np.mean(cv_score)}")
print(f"train_score: {model.score(X_train, y_train)}")
print(f"test_score : {model.score(X_test, y_test)}")
print(classification_report(y_test, y_pred))

Model used is Random Forest:
cv_score   : 0.8212591575091576
train_score: 0.3692645235142612
test_score : 0.3693246852346433
              precision    recall  f1-score   support

           0       0.89      0.11      0.20      1839
           1       0.32      0.97      0.48       782

    accuracy                           0.37      2621
   macro avg       0.61      0.54      0.34      2621
weighted avg       0.72      0.37      0.29      2621



In [104]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dropout, Dense, Flatten
from keras.metrics import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.models import load_model

physical_devices = tf.config.experimental.list_physical_devices("GPU")
print("NUM GPUs Available: ", len(physical_devices))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
mc = ModelCheckpoint('models/best_training_{val_loss:.4f}.h5',
                     monitor='val_accuracy', mode='max', verbose=1,
                     save_best_only=True)
lrl = ReduceLROnPlateau(patience=10,verbose=1,monitor="loss")

print(X_train_standard.shape)
model = Sequential([
    Dense(units=16, input_shape=(102,), activation="relu"),
    Dense(units=16, activation="relu"),
    Dropout(0.2),
    Dense(units=32, activation="relu"),
    Dropout(0.2),
    Dense(units=8, activation="relu"),
    Dense(units=1, activation="sigmoid")
])

learning_rate = 0.001
loss = "binary_crossentropy"
metrics = ["accuracy"]


model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss=loss, metrics=metrics)
history = model.fit(X_train,y_train,batch_size=100,epochs=200,
          shuffle=True, verbose=2, validation_split=0.2,
                    callbacks=[es,mc,lrl])

NUM GPUs Available:  1
(10483, 102)
Epoch 1/200
84/84 - 1s - loss: 0.6430 - accuracy: 0.6377 - val_loss: 0.5723 - val_accuracy: 0.7110

Epoch 00001: val_accuracy improved from -inf to 0.71102, saving model to models\best_training_0.5723.h5
Epoch 2/200
84/84 - 0s - loss: 0.5417 - accuracy: 0.7355 - val_loss: 0.4683 - val_accuracy: 0.7902

Epoch 00002: val_accuracy improved from 0.71102 to 0.79018, saving model to models\best_training_0.4683.h5
Epoch 3/200
84/84 - 0s - loss: 0.4838 - accuracy: 0.7762 - val_loss: 0.4296 - val_accuracy: 0.8054

Epoch 00003: val_accuracy improved from 0.79018 to 0.80544, saving model to models\best_training_0.4296.h5
Epoch 4/200
84/84 - 0s - loss: 0.4514 - accuracy: 0.7975 - val_loss: 0.4144 - val_accuracy: 0.8202

Epoch 00004: val_accuracy improved from 0.80544 to 0.82022, saving model to models\best_training_0.4144.h5
Epoch 5/200
84/84 - 0s - loss: 0.4409 - accuracy: 0.8050 - val_loss: 0.4129 - val_accuracy: 0.8159

Epoch 00005: val_accuracy did not impro

In [106]:
best_model = load_model("models/best_training_0.4025.h5")
print(confusion_matrix(y_test,np.round(best_model.predict(X_test))))
print(best_model.evaluate(X_test, y_test))

[[1677  162]
 [ 300  482]]
[0.40334412455558777, 0.8237314224243164]
