In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

In [2]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
from keras.models import Sequential
from keras.layers import Dense

Set notebook properties

In [4]:
# Notebook-wide display settings: silence every warning category, let pandas
# render all columns of wide frames, and use seaborn's dark grid theme.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
sns.set_style("darkgrid")

Set data path

In [5]:
# Directory holding the input CSV files, relative to the notebook's working directory.
DATA_PATH = r'../data_source'

In [6]:
# Read the three input tables from DATA_PATH: training features, training
# labels and test features (in that order).
training_set_features, training_set_labels, test_set_features = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

In [7]:
# Left-join the labels onto the features by respondent_id.
# NOTE(review): train_df is not referenced again in the visible cells.
train_df = pd.merge(
    training_set_features,
    training_set_labels,
    how='left',
    on=['respondent_id'],
)

#### Process Features

In [8]:
def process_features(df):
    """One-hot encode the survey columns listed in ``cols_to_process``.

    Each value of a processed column is turned into the label
    ``'<column>_' + str(value)`` (so a missing float value becomes
    ``'<column>_nan'``), and every distinct label becomes one indicator
    column via ``pd.get_dummies``.

    The input frame is NOT modified. The previous implementation wrote the
    prefixed labels back into ``df``, which altered the caller's raw data and
    produced doubled prefixes when the function was called twice on the same
    frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols_to_process``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that were not processed, followed by the
        indicator columns (grouped in ``cols_to_process`` order).
    """
    cols_to_process = ['h1n1_concern', 'h1n1_knowledge',
                       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
                       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
                       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
                       'education', 'race', 'sex', 'income_poverty', 'marital_status',
                       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                       'household_adults', 'household_children', 'employment_industry',
                       'employment_occupation']

    # Build the prefixed labels in fresh Series (sharing df's index so the
    # final concat lines up row-for-row) instead of assigning into ``df``.
    encoded_parts = [
        pd.get_dummies(
            pd.Series([f'{col}_' + str(value) for value in df[col]], index=df.index)
        )
        for col in cols_to_process
    ]

    one_hot_encoded = pd.concat(encoded_parts, axis=1)
    remaining = df.drop(columns=cols_to_process)  # drop() returns a new frame

    return pd.concat([remaining, one_hot_encoded], axis=1)

In [9]:
# Encode the training features, drop the first column by position
# (presumably respondent_id -- confirm against the CSV layout) and replace
# remaining missing values with 0.
train_encoded = process_features(training_set_features)
X = train_encoded.iloc[:, 1:].fillna(0)

# One target vector per vaccine, taken row-for-row from the labels table.
y_h1n1, y_seasonal = (
    training_set_labels['h1n1_vaccine'],
    training_set_labels['seasonal_vaccine'],
)

In [10]:
# Encode the test features exactly like the training features, then align the
# result to the training columns. One-hot encoding is data dependent, so a
# category that appears in only one of the two files would otherwise give the
# test matrix a different width or column order than the matrix the models
# were fitted on. Columns missing from the test set are filled with 0; columns
# unseen during training are dropped.
X_test = (
    process_features(test_set_features)
    .iloc[:, 1:]
    .fillna(0)
    .reindex(columns=X.columns, fill_value=0)
)

In [11]:
def train_model(X, y):
    """Build and fit a small feed-forward binary classifier.

    Architecture: Dense(25, relu) -> Dense(10, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and the Adam optimizer, then
    trained for 20 epochs with a batch size of 10.

    Parameters
    ----------
    X : 2-D feature table exposing ``.shape`` (e.g. a DataFrame or ndarray);
        its second dimension sets the width of the input layer.
    y : target values passed straight to ``model.fit``; expected to be binary
        given the sigmoid output and binary cross-entropy loss.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Size the input layer from the data instead of hard-coding 157, so the
    # model still builds when the encoded feature count changes.
    n_features = X.shape[1]

    model = Sequential()
    model.add(Dense(25, input_dim=n_features, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [12]:
def train_model2(X, y):
    """Build and fit a feed-forward binary classifier with a wider first layer.

    Architecture: Dense(45, relu) -> Dense(10, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and the Adam optimizer, then
    trained for 20 epochs with a batch size of 10.

    Parameters
    ----------
    X : 2-D feature table exposing ``.shape`` (e.g. a DataFrame or ndarray);
        its second dimension sets the width of the input layer.
    y : target values passed straight to ``model.fit``; expected to be binary
        given the sigmoid output and binary cross-entropy loss.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Size the input layer from the data instead of hard-coding 157, so the
    # model still builds when the encoded feature count changes.
    n_features = X.shape[1]

    model = Sequential()
    model.add(Dense(45, input_dim=n_features, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [13]:
def train_model3(X, y):
    """Build and fit a deeper feed-forward binary classifier.

    Architecture: Dense(30) -> Dense(10) -> Dense(30) -> Dense(10) ->
    Dense(30), all with relu activation, followed by Dense(1, sigmoid).
    Compiled with binary cross-entropy loss and the Adam optimizer, then
    trained for 20 epochs with a batch size of 10.

    Parameters
    ----------
    X : 2-D feature table exposing ``.shape`` (e.g. a DataFrame or ndarray);
        its second dimension sets the width of the input layer.
    y : target values passed straight to ``model.fit``; expected to be binary
        given the sigmoid output and binary cross-entropy loss.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Size the input layer from the data instead of hard-coding 157, so the
    # model still builds when the encoded feature count changes.
    n_features = X.shape[1]

    model = Sequential()
    model.add(Dense(30, input_dim=n_features, activation='relu'))
    for width in (10, 30, 10, 30):
        model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [None]:
# NOTE(review): this cell re-defines train_model3, which is also defined in
# the preceding cell. When the cells run top to bottom this later definition
# is the one the training calls pick up; consider deleting one of the copies.
def train_model3(X, y):
    """Build and fit a deeper feed-forward binary classifier.

    Architecture: Dense(30) -> Dense(10) -> Dense(30) -> Dense(10) ->
    Dense(30), all with relu activation, followed by Dense(1, sigmoid).
    Compiled with binary cross-entropy loss and the Adam optimizer, then
    trained for 20 epochs with a batch size of 10.

    Parameters
    ----------
    X : 2-D feature table exposing ``.shape`` (e.g. a DataFrame or ndarray);
        its second dimension sets the width of the input layer.
    y : target values passed straight to ``model.fit``; expected to be binary
        given the sigmoid output and binary cross-entropy loss.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Size the input layer from the data instead of hard-coding 157, so the
    # model still builds when the encoded feature count changes.
    n_features = X.shape[1]

    model = Sequential()
    model.add(Dense(30, input_dim=n_features, activation='relu'))
    for width in (10, 30, 10, 30):
        model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [14]:
# Fit the H1N1-vaccine classifier on the encoded training features.
trained_model_h1n1 = train_model3(X=X, y=y_h1n1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [15]:
# Fit the seasonal-vaccine classifier on the same encoded training features.
trained_model_seasonal = train_model3(X=X, y=y_seasonal)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
# Sequential.predict_classes / predict_proba were removed from Keras, so use
# predict() once and derive both results from it. The model ends in a single
# sigmoid unit, so predict() yields one probability per row.
raw_h1n1 = trained_model_h1n1.predict(X_test)
# Same rule predict_classes applied for a single sigmoid output: class 1 above 0.5.
predictions_h1n1 = (raw_h1n1 > 0.5).astype('int32')
probability_h1n1 = raw_h1n1.flatten()

In [17]:
# Sequential.predict_classes / predict_proba were removed from Keras, so use
# predict() once and derive both results from it. The model ends in a single
# sigmoid unit, so predict() yields one probability per row.
raw_seasonal = trained_model_seasonal.predict(X_test)
# Same rule predict_classes applied for a single sigmoid output: class 1 above 0.5.
predictions_seasonal = (raw_seasonal > 0.5).astype('int32')
probability_seasonal = raw_seasonal.flatten()

In [18]:
# Display the flattened seasonal-vaccine probabilities as a quick sanity check.
probability_seasonal

array([0.02190632, 0.0198873 , 0.8890126 , ..., 0.1791381 , 0.10175017,
       0.21224797], dtype=float32)

In [19]:
# Assemble the submission table: respondent ids from the test set plus the
# predicted probability for each vaccine, in that column order.
submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'h1n1_vaccine': probability_h1n1,
    'seasonal_vaccine': probability_seasonal,
})

In [22]:
# Destination of the submission CSV, relative to the notebook's working directory.
outpath = os.path.join(r'../output', 'sub3.csv')

In [23]:
# Make sure the target directory exists before writing; to_csv does not create
# missing parent directories and would fail on a fresh checkout.
out_dir = os.path.dirname(outpath)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
submission.to_csv(outpath, index=False)