In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# importing the data

train = pd.read_csv('training_set_features.csv',index_col='respondent_id')
train_labels = pd.read_csv('training_set_labels.csv',index_col='respondent_id')
test = pd.read_csv('test_set_features.csv',index_col='respondent_id')

trainN = train[train.columns[train.dtypes != 'object']] # continuous
trainC = train[train.columns[train.dtypes == 'object']] # discrete



In [3]:
## categorical dataframe
# take care of NA's by replacing them with 'Not Reported' (NR) for now
trainC2 = trainC.fillna('NR')

#  get dummies/encode categorical variables
DtrainC = pd.get_dummies(trainC2,drop_first=True)

In [4]:
## expirement cell
## impute missing values
## center and scale numerical variables
# unfortunately this does not preserve column names

si = SimpleImputer(missing_values = np.nan,strategy = 'mean').fit(trainN)
si_trainN = pd.DataFrame(si.transform(trainN))


ss = StandardScaler().fit(si_trainN)
trainN_proc = pd.DataFrame(ss.transform(si_trainN),
                           columns = si_trainN.columns,
                          index = trainN.index)


In [5]:
# use the transformers from the training set on the test set, and encode categoricals

testN = test[test.columns[test.dtypes != 'object']] # continuous
testC = test[test.columns[test.dtypes == 'object']] # discrete

testC2 = testC.fillna('NR')
DtestC = pd.get_dummies(testC2,drop_first=True)

si_testN = pd.DataFrame(si.transform(testN))

testN_proc = pd.DataFrame(ss.fit_transform(si_testN),
                          columns = si_testN.columns,
                         index = testN.index)


In [6]:
DtestC.head()

Unnamed: 0_level_0,age_group_35 - 44 Years,age_group_45 - 54 Years,age_group_55 - 64 Years,age_group_65+ Years,education_< 12 Years,education_College Graduate,education_NR,education_Some College,race_Hispanic,race_Other or Multiple,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
26708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
26709,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26710,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26711,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
#  combine the two datasets
DF = trainN_proc.join(DtrainC) 
DF_sub = testN_proc.join(DtestC) 

In [8]:
#  quick check for any NA values
findNA = lambda x: x.isnull().values.sum()
DF_sub.apply(findNA).sum()

0

In [9]:
# split the data into training and test datasets

X_train, X_test, Y_train, Y_test = train_test_split(
    DF,
    train_labels,
    test_size=0.33,
    shuffle=True,
    stratify=train_labels,
)

In [10]:
# train a Logistic regression model on the training data
LR_model = MultiOutputClassifier(estimator = LogisticRegression(max_iter=200)).fit(X_train,Y_train)

# classification score.  The competition is judged using area-under-the-curve however (ROC AUC)
LR_model.score(X_test,Y_test)

0.6684819605173588

In [11]:
## Create our predictions.  This outputs probability of a 1 (getting a vaccine)

preds = LR_model.predict_proba(X_test)

Y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = Y_test.index
)
Y_preds

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8422,0.080938,0.470871
7722,0.475598,0.641839
995,0.414504,0.734935
15196,0.054465,0.813987
18358,0.025077,0.224538
...,...,...
3234,0.144830,0.495289
8107,0.713537,0.901696
18524,0.017086,0.769148
2869,0.302671,0.647071


In [12]:
#  look at the average ROC area under the curve score
roc_auc_score(Y_test,Y_preds)

0.8460844224146622

In [13]:
#  make our submission model by training on the entire data set
#  make our predictions to be submitted

sub_model = MultiOutputClassifier(estimator = LogisticRegression(max_iter=200)).fit(DF,train_labels)

sub_preds = sub_model.predict_proba(DF_sub)

sub_preds = pd.DataFrame(
    {
        "h1n1_vaccine": sub_preds[0][:, 1],
        "seasonal_vaccine": sub_preds[1][:, 1],
    }
)

sub_preds

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0.064991,0.303810
1,0.019393,0.036234
2,0.429569,0.589411
3,0.484665,0.872129
4,0.180902,0.492491
...,...,...
26703,0.307400,0.443183
26704,0.045538,0.253502
26705,0.159531,0.209074
26706,0.055971,0.347924


In [14]:
#  look at the submission format
sub_form = pd.read_csv('submission_format.csv')
sub_form.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7


In [15]:
#  insert the respondent id as a column
sub_preds.insert(loc=0,column='respondent_id',value=DF_sub.index)
sub_preds.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.064991,0.30381
1,26708,0.019393,0.036234
2,26709,0.429569,0.589411
3,26710,0.484665,0.872129
4,26711,0.180902,0.492491


In [16]:
sub_preds.to_csv('preds1_LR_MI.csv',index=False)

preds1_LR == first submission (mode imputed) <br>
preds1_LR_MI == second submission (mean imputed)