In [18]:
import pandas as pd
from utils.functions import *
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [19]:
# Load the H1N1 training table and keep only rows with no missing values.
df = pd.read_csv("data/training_h1n1.csv").dropna()

In [20]:
# Quick structural overview of the cleaned frame: column names, dtypes,
# summary statistics of the numeric columns, and (rows, columns).
for overview in (df.columns, df.dtypes, df.describe(), df.shape):
    print(overview)

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face',
       ...
       'qxajmpny', 'rcertsgn', 'tfqavkke', 'ukymxvdu', 'uqqtjvyb', 'vlluhbov',
       'xgwztkwe', 'xqwwgdyp', 'xtkaffoo', 'xzmlyyjv'],
      dtype='object', length=104)
respondent_id                  int64
h1n1_concern                 float64
h1n1_knowledge               float64
behavioral_antiviral_meds    float64
behavioral_avoidance         float64
                              ...   
vlluhbov                       int64
xgwztkwe                       int64
xqwwgdyp                       int64
xtkaffoo                       int64
xzmlyyjv                       int64
Length: 104, dtype: object
       respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
count   13104.000000  13104.000000

In [21]:
# types = ["int64","float64"]
# categorical = []
# numerical = []
#
# for column in df.columns:
#     if seperate_columns_by_type(df[column], types):
#         numerical.append(column)
#     else:
#         categorical.append(column)

In [22]:
# for column in categorical[:-1]:
#     sns.catplot(kind="count", data=df, x=column, hue=categorical[-1])
#     plt.title(f"Countplot on {column}")
#     plt.xticks(rotation=45)
#     plt.show()

## Splitting into train and test sets and standardizing the features

In [23]:
# Label column to predict.
target_column = "h1n1_vaccine"

# Features are every column except the label and the respondent identifier.
feature_frame = df.drop(columns=[target_column, "respondent_id"])
X = feature_frame.values
y = df[target_column].values

# 80/20 split, stratified on the label so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_standard = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)

## Trying out different models for predictions
### Linear regression

In [24]:
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
#
# degree=2
# model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
# model.fit(X_train_standard, y_train)
# model_score = model.score(X_train,y_train)
# print(model_score)

### Random Forest (note: the model struggles with the positive class, 1 — recall is low)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Baseline classifier: a random forest with a fixed seed for reproducibility,
# trained on the standardized training split.
model = RandomForestClassifier(random_state=40)
model.fit(X_train_standard, y_train)
y_pred = model.predict(X_test_standard)

# Cross-validate on the same standardized matrix the model was fitted on, so
# cv_score, train_score and test_score are all computed on identical features
# (previously the CV ran on the unscaled X_train).
# cross_val_score clones the estimator for every fold, so the fitted `model`
# above is left untouched.
# NOTE(review): cv=100 refits the forest 100 times; a smaller fold count would
# make this cell much cheaper to re-run - confirm whether 100 is intentional.
cv_score = cross_val_score(model, X_train_standard, y_train, cv=100)

print("Model used is Random Forest:")
print(f"cv_score   : {np.mean(cv_score)}")
print(f"train_score: {model.score(X_train_standard, y_train)}")
print(f"test_score : {model.score(X_test_standard, y_test)}")
# Per-class precision/recall/F1 on the held-out test split.
print(classification_report(y_test, y_pred))

Model used is Random Forest:
cv_score   : 0.8212591575091576
train_score: 1.0
test_score : 0.8222052651659671
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      1839
           1       0.78      0.57      0.66       782

    accuracy                           0.82      2621
   macro avg       0.81      0.75      0.77      2621
weighted avg       0.82      0.82      0.81      2621



In [28]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dropout, Dense, Flatten
from keras.metrics import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.models import load_model
# NOTE(review): this cell mixes `keras.*` and `tensorflow.keras.*` imports;
# confirm both resolve to the same Keras installation in this environment.

# Enable on-demand GPU memory allocation when a GPU is present.
# Guarded so the cell also runs on CPU-only machines (no IndexError on an
# empty device list) and when re-executed after the GPU has already been
# initialized (TensorFlow raises RuntimeError in that case).
physical_devices = tf.config.experimental.list_physical_devices("GPU")
print("NUM GPUs Available: ", len(physical_devices))
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        print(f"Could not enable memory growth on {gpu.name}: {err}")

# Callbacks:
#   es  - stop when val_loss has not improved for 50 epochs
#   mc  - save a checkpoint whenever val_accuracy reaches a new maximum; the
#         file name embeds the train and validation accuracy of that epoch
#   lrl - reduce the learning rate when the training loss plateaus for 10 epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
mc = ModelCheckpoint('models/h1z1_training_{accuracy:.4f}_{val_accuracy:.4f}.h5',
                     monitor='val_accuracy', mode='max', verbose=1,
                     save_best_only=True)
lrl = ReduceLROnPlateau(patience=10,verbose=1,monitor="loss")

print(X_train_standard.shape)

# Derive the input width from the data instead of hard-coding it, so the
# network keeps working if the feature set changes.
n_features = X_train_standard.shape[1]

# Small fully connected binary classifier with a sigmoid output unit.
model = Sequential([
    Dense(units=16, input_shape=(n_features,), activation="relu"),
    Dense(units=16, activation="relu"),
    Dropout(0.2),
    Dense(units=32, activation="relu"),
    Dropout(0.2),
    Dense(units=8, activation="relu"),
    Dense(units=1, activation="sigmoid")
])

learning_rate = 0.001
loss = "binary_crossentropy"
metrics = ["accuracy"]


model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss=loss, metrics=metrics)
# validation_split=0.5 holds out half of the training split for validation.
# NOTE(review): per the Keras docs the held-out part is taken from the end of
# the arrays before shuffling - confirm a 50% hold-out is intended.
history = model.fit(X_train_standard, y_train, batch_size=100,epochs=200,
          shuffle=True, verbose=2, validation_split=0.5,
                    callbacks=[es,mc,lrl])

NUM GPUs Available:  1
(10483, 102)
Epoch 1/200
53/53 - 3s - loss: 0.6420 - accuracy: 0.6737 - val_loss: 0.6020 - val_accuracy: 0.6980

Epoch 00001: val_accuracy improved from -inf to 0.69802, saving model to models\best_training_0.6737_0.6980.h5
Epoch 2/200
53/53 - 1s - loss: 0.5753 - accuracy: 0.7050 - val_loss: 0.5603 - val_accuracy: 0.6980

Epoch 00002: val_accuracy did not improve from 0.69802
Epoch 3/200
53/53 - 1s - loss: 0.5392 - accuracy: 0.7050 - val_loss: 0.5300 - val_accuracy: 0.6980

Epoch 00003: val_accuracy did not improve from 0.69802
Epoch 4/200
53/53 - 1s - loss: 0.5072 - accuracy: 0.7050 - val_loss: 0.5103 - val_accuracy: 0.6980

Epoch 00004: val_accuracy did not improve from 0.69802
Epoch 5/200
53/53 - 1s - loss: 0.4848 - accuracy: 0.7188 - val_loss: 0.4988 - val_accuracy: 0.7551

Epoch 00005: val_accuracy improved from 0.69802 to 0.75506, saving model to models\best_training_0.7188_0.7551.h5
Epoch 6/200
53/53 - 1s - loss: 0.4732 - accuracy: 0.7749 - val_loss: 0.491

In [29]:
# Ask the project helper for the checkpoint file to keep, then load it.
# NOTE(review): the exact selection rule of keep_best_saved_h5 (and the meaning
# of the 0.02 argument) is defined in utils.functions - confirm there.
best_model = keep_best_saved_h5("/models", "h1z1_training_", 0.02)
best_model_path = f"models/{best_model}"
loaded_model = load_model(best_model_path)

Currently in directory:C:\Users\samyn\Desktop\GNT-Arai-31\Personal projects\drivendata_flu_shot_learning
File coming out of the function: best_training_0.8102_0.7921.h5


In [30]:
# Round the loaded model's predictions to 0/1 and compare them with the
# true test labels in a confusion matrix.
test_scores = loaded_model.predict(X_test_standard)
test_labels_pred = np.round(test_scores)
print(confusion_matrix(y_test, test_labels_pred))

# Result of Keras evaluate() on the held-out test split
# (presumably [loss, accuracy], matching the compiled metrics - verify).
test_evaluation = loaded_model.evaluate(X_test_standard, y_test)
print(test_evaluation)

[[1707  132]
 [ 383  399]]
[0.46703094244003296, 0.8035101294517517]
