In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

In [3]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [4]:
from keras.models import Sequential
from keras.layers import Dense

Set notebook properties

In [5]:
# Notebook-wide settings: dark-grid seaborn theme, no cap on the number of
# DataFrame columns displayed, and all warnings silenced.
sns.set_style("darkgrid")
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

Set data path

In [6]:
# Relative path of the directory the training/test CSV files are read from.
DATA_PATH = r'../data_source'

In [7]:
# Read the three input tables from DATA_PATH in one pass; the order of the
# file names matches the order of the names being assigned.
training_set_features, training_set_labels, test_set_features = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

In [8]:
# Left-join the label columns onto the feature rows by respondent_id.
train_df = training_set_features.merge(
    right=training_set_labels,
    how='left',
    on=['respondent_id'],
)

#### Process Features

In [9]:
def process_features(df):
    """One-hot encode the categorical/ordinal survey columns of ``df``.

    Every value of each listed column is turned into the label
    ``"<column>_<value>"`` and expanded into indicator columns with
    ``pd.get_dummies``. Missing values are stringified like any other value,
    so they get their own indicator (e.g. ``"<column>_nan"``).

    The input frame is NOT modified: the prefixed labels are built in new
    Series instead of being written back into ``df``, so the function can be
    called repeatedly on the same frame and always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols_to_process``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that were not encoded (original order), followed
        by the indicator columns. The set of indicator columns depends on the
        values present in ``df``, so two different frames are not guaranteed
        to produce the same columns.
    """
    cols_to_process = ['h1n1_concern', 'h1n1_knowledge',
                       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
                       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
                       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
                       'education', 'race', 'sex', 'income_poverty', 'marital_status',
                       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                       'household_adults', 'household_children', 'employment_industry',
                       'employment_occupation']

    indicator_frames = []
    for col in cols_to_process:
        # Build the prefixed labels in a fresh Series (same index as df) so the
        # caller's frame is left untouched.
        labelled = pd.Series(
            [f'{col}_' + str(value) for value in df[col]],
            index=df.index,
            name=col,
        )
        indicator_frames.append(pd.get_dummies(labelled))

    one_hot_encoded = pd.concat(indicator_frames, axis=1)
    untouched = df.drop(columns=cols_to_process)

    return pd.concat([untouched, one_hot_encoded], axis=1)

In [10]:
# Encode the training features, drop the first column by position and
# replace any remaining missing values with 0.
X = (
    process_features(training_set_features)
    .iloc[:, 1:]
    .fillna(0)
)

# Target columns for the two models.
y_h1n1 = training_set_labels.loc[:, 'h1n1_vaccine']
y_seasonal = training_set_labels.loc[:, 'seasonal_vaccine']

In [11]:
# Same preparation as for the training features: encode, drop the first
# column by position, fill remaining missing values with 0.
X_test = (
    process_features(test_set_features)
    .iloc[:, 1:]
    .fillna(0)
)

In [12]:
def train_model(X,y):
    """Build, compile and fit a small feed-forward classifier.

    Layers: Dense(25, relu) -> Dense(10, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric, then fitted for 20 epochs with a batch size of 10.

    The width of the input layer is read from ``X`` instead of being
    hard-coded, so the function keeps working when the number of encoded
    feature columns changes.

    Parameters
    ----------
    X : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Feature matrix; the second dimension is the number of features.
    y : array-like
        Target values passed to ``model.fit``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Number of feature columns; np.shape handles DataFrames, arrays and nested lists.
    n_features = np.shape(X)[1]

    model = Sequential()
    model.add(Dense(25, input_dim=n_features, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [13]:
def train_model2(X,y):
    """Build, compile and fit a feed-forward classifier with a wider first layer.

    Layers: Dense(45, relu) -> Dense(10, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric, then fitted for 20 epochs with a batch size of 10.

    The width of the input layer is read from ``X`` instead of being
    hard-coded, so the function keeps working when the number of encoded
    feature columns changes.

    Parameters
    ----------
    X : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Feature matrix; the second dimension is the number of features.
    y : array-like
        Target values passed to ``model.fit``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Number of feature columns; np.shape handles DataFrames, arrays and nested lists.
    n_features = np.shape(X)[1]

    model = Sequential()
    model.add(Dense(45, input_dim=n_features, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [14]:
def train_model3(X,y):
    """Build, compile and fit a deeper feed-forward classifier (all-ReLU variant).

    Layers: Dense(30) -> Dense(10) -> Dense(30) -> Dense(10) -> Dense(30),
    each with ReLU activation, followed by Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric, then fitted for 20 epochs with a batch size of 10.

    NOTE(review): a later cell in this notebook defines another function with
    the same name ``train_model3``; when the notebook is run top to bottom
    that later definition replaces this one.

    The width of the input layer is read from ``X`` instead of being
    hard-coded, so the function keeps working when the number of encoded
    feature columns changes.

    Parameters
    ----------
    X : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Feature matrix; the second dimension is the number of features.
    y : array-like
        Target values passed to ``model.fit``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Number of feature columns; np.shape handles DataFrames, arrays and nested lists.
    n_features = np.shape(X)[1]

    model = Sequential()
    model.add(Dense(30, input_dim=n_features, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [15]:
def train_model3(X,y):
    """Build, compile and fit a deeper feed-forward classifier.

    Layers: Dense(30, relu) -> Dense(10) -> Dense(30) -> Dense(10) ->
    Dense(30) -> Dense(1, sigmoid). Only the first and last layers are given
    an ``activation`` argument; the four layers in between are created
    without one.
    Compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric, then fitted for 20 epochs with a batch size of 10.

    NOTE(review): an earlier cell in this notebook also defines
    ``train_model3``; this definition replaces it when the notebook is run
    top to bottom. Consider giving the two variants distinct names.

    The width of the input layer is read from ``X`` instead of being
    hard-coded, so the function keeps working when the number of encoded
    feature columns changes.

    Parameters
    ----------
    X : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Feature matrix; the second dimension is the number of features.
    y : array-like
        Target values passed to ``model.fit``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Number of feature columns; np.shape handles DataFrames, arrays and nested lists.
    n_features = np.shape(X)[1]

    model = Sequential()
    model.add(Dense(30, input_dim=n_features, activation='relu'))
    model.add(Dense(10))
    model.add(Dense(30))
    model.add(Dense(10))
    model.add(Dense(30))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [16]:
def train_model4(X,y):
    """Build, compile and fit a minimal two-layer classifier.

    Layers: Dense(30, linear) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric, then fitted for 20 epochs with a batch size of 10.

    The width of the input layer is read from ``X`` instead of being
    hard-coded, so the function keeps working when the number of encoded
    feature columns changes.

    Parameters
    ----------
    X : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Feature matrix; the second dimension is the number of features.
    y : array-like
        Target values passed to ``model.fit``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # Number of feature columns; np.shape handles DataFrames, arrays and nested lists.
    n_features = np.shape(X)[1]

    model = Sequential()
    model.add(Dense(30, input_dim=n_features, activation='linear'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=20, batch_size=10, verbose=1)

    return model

In [21]:
def hyperParameterTuning(X_train, y_train):
    """Grid-search XGBClassifier hyper-parameters with 5-fold cross-validation.

    The search covers every combination in ``param_tuning`` and runs on all
    available cores (``n_jobs=-1``). No ``scoring`` argument is passed to
    ``GridSearchCV``.

    The feature columns are renamed to positional names (``col_0``,
    ``col_1``, ...) before fitting. The renaming is done on a copy, so the
    DataFrame passed in by the caller keeps its original column names.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['binary:logistic']
    }

    # Rename on a copy: assigning to X_train.columns directly would silently
    # rename the caller's DataFrame as a side effect.
    X_renamed = X_train.copy()
    X_renamed.columns = ['col_' + str(x) for x in range(len(X_renamed.columns))]

    xgb_model = XGBClassifier()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

    gsearch.fit(X_renamed, y_train)

    return gsearch.best_params_

In [None]:
# Run the grid search on the seasonal-vaccine target; the returned best
# parameters are shown as the cell output and not stored.
hyperParameterTuning(X_train=X, y_train=y_seasonal)

In [17]:
# Fit the model used for the h1n1_vaccine target.
# NOTE(review): `train_model5` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless it is
# defined elsewhere — TODO confirm and restore the definition.
trained_model_h1n1 = train_model5(X, y_h1n1)



In [18]:
# Fit the model used for the seasonal_vaccine target.
# NOTE(review): `train_model5` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless it is
# defined elsewhere — TODO confirm and restore the definition.
trained_model_seasonal = train_model5(X, y_seasonal)



In [26]:
# Give the test columns positional names (col_0, col_1, ...), then keep the
# second entry of every predict_proba row (presumably the positive-class
# probability — verify against the model returned by train_model5).
X_test.columns = ['col_' + str(position) for position, _ in enumerate(X_test.columns)]
probability_h1n1 = [row[1] for row in trained_model_h1n1.predict_proba(X_test)]

In [27]:
# Give the test columns positional names (col_0, col_1, ...), then keep the
# second entry of every predict_proba row (presumably the positive-class
# probability — verify against the model returned by train_model5).
X_test.columns = ['col_' + str(position) for position, _ in enumerate(X_test.columns)]
probability_seasonal = [row[1] for row in trained_model_seasonal.predict_proba(X_test)]

In [28]:
# Assemble the submission table: respondent ids from the test set next to
# the two predicted probability columns.
submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'h1n1_vaccine': probability_h1n1,
    'seasonal_vaccine': probability_seasonal,
})

In [29]:
# Destination file for the submission CSV, relative to the notebook's working directory.
outpath = os.path.join(r'../output', 'sub7.csv')

In [30]:
# Make sure the destination folder exists so the write does not fail with a
# missing-directory error after the models have already been trained.
output_dir = os.path.dirname(outpath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the submission without the DataFrame index column.
submission.to_csv(outpath, index=False)