In [20]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# importing the data

train = pd.read_csv('training_set_features.csv',index_col='respondent_id')
train_labels = pd.read_csv('training_set_labels.csv',index_col='respondent_id')
test = pd.read_csv('test_set_features.csv',index_col='respondent_id')

trainN = train[train.columns[train.dtypes != 'object']] # continuous
trainC = train[train.columns[train.dtypes == 'object']] # discrete



In [3]:
## categorical dataframe
# take care of NA's by replacing them with 'Not Reported' (NR) for now
trainC2 = trainC.fillna('NR')

#  get dummies/encode categorical variables
DtrainC = pd.get_dummies(trainC2,drop_first=True)

In [4]:
## expirement cell
## impute missing values
## center and scale numerical variables
# unfortunately this does not preserve column names

si = SimpleImputer(missing_values = np.nan,strategy = 'mean').fit(trainN)
si_trainN = pd.DataFrame(si.transform(trainN))


ss = StandardScaler().fit(si_trainN)
trainN_proc = pd.DataFrame(ss.transform(si_trainN),
                           columns = si_trainN.columns,
                          index = trainN.index)


In [5]:
# use the transformers from the training set on the test set, and encode categoricals

testN = test[test.columns[test.dtypes != 'object']] # continuous
testC = test[test.columns[test.dtypes == 'object']] # discrete

testC2 = testC.fillna('NR')
DtestC = pd.get_dummies(testC2,drop_first=True)

si_testN = pd.DataFrame(si.transform(testN))

testN_proc = pd.DataFrame(ss.fit_transform(si_testN),
                          columns = si_testN.columns,
                         index = testN.index)


In [7]:
#  combine the two datasets
DF = trainN_proc.join(DtrainC) 
DF_sub = testN_proc.join(DtestC) 

In [9]:
# split the data into training and test datasets

X_train, X_test, Y_train, Y_test = train_test_split(
    DF,
    train_labels,
    test_size=0.33,
    shuffle=True,
    stratify=train_labels,
)

In [10]:
# train a Logistic regression model on the training data
LR_model = MultiOutputClassifier(estimator = LogisticRegression(max_iter=200)).fit(X_train,Y_train)

# classification score.  The competition is judged using area-under-the-curve however (ROC AUC)
LR_model.score(X_test,Y_test)

0.6684819605173588

In [70]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold

pipeline = Pipeline([

    ('clf', MultiOutputClassifier(LogisticRegression(max_iter=200)))
])

#parameters = [
#    {"clf": [LogisticRegression(max_iter=200)],
#    "clf__estimator__C": Cparam}
#]

parameters = {'clf__estimator__C': [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]}

rkf = RepeatedKFold(
    n_splits=10,
    n_repeats=2,
    random_state=42
)

cv = GridSearchCV(
    pipeline,
    parameters,
    cv=rkf,
    scoring= 'roc_auc',
)

cv

GridSearchCV(cv=RepeatedKFold(n_repeats=2, n_splits=10, random_state=42),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        MultiOutputClassifier(estimator=LogisticRegression(C=1.0,
                                                                                           class_weight=None,
                                                                                           dual=False,
                                                                                           fit_intercept=True,
                                                                                           intercept_scaling=1,
                                                                                           l1_ratio=None,
                                                                                           max_iter=200,
                                                           

In [71]:
cv.fit(X_train,Y_train)
cv.cv_results_

{'mean_fit_time': array([0.36383033, 0.43927499, 0.48185881, 0.52340492, 0.54838452,
        0.57950388, 0.61555352, 0.63619618, 0.67399802, 0.66856203]),
 'std_fit_time': array([0.02543078, 0.02967829, 0.03274918, 0.02415913, 0.03295768,
        0.03100751, 0.04434022, 0.04462313, 0.06304217, 0.03811829]),
 'mean_score_time': array([0.00583074, 0.00638261, 0.00593573, 0.00617795, 0.00638241,
        0.00642846, 0.0065818 , 0.00613493, 0.005984  , 0.00668093]),
 'std_score_time': array([0.00100672, 0.00106843, 0.0008662 , 0.00115743, 0.00110914,
        0.00127872, 0.0015889 , 0.00119274, 0.001001  , 0.00161245]),
 'param_clf__estimator__C': masked_array(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__estimator__C': 0.1},
  {'clf__estimator__C': 0.2},
  {'clf__estimator__C': 0.3},
  {'clf__estimator_

In [72]:
optoLR = cv.best_estimator_
optoLR.fit(X_train,Y_train)

preds = optoLR.predict_proba(X_test)

Y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = Y_test.index
)

roc_auc_score(Y_test,Y_preds)

0.8462071254305994

In [12]:
#  look at the average ROC area under the curve score
roc_auc_score(Y_test,Y_preds)

0.8460844224146622

In [73]:
#  make our submission model by training on the entire data set
#  make our predictions to be submitted

sub_preds = optoLR.predict_proba(DF_sub)

sub_preds = pd.DataFrame(
    {
        "h1n1_vaccine": sub_preds[0][:, 1],
        "seasonal_vaccine": sub_preds[1][:, 1],
    }
)

sub_preds

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0.074726,0.313458
1,0.020953,0.031925
2,0.473723,0.646697
3,0.480853,0.868863
4,0.223085,0.519225
...,...,...
26703,0.346225,0.483380
26704,0.045066,0.235377
26705,0.153685,0.203492
26706,0.052612,0.346455


In [74]:
#  look at the submission format
sub_form = pd.read_csv('submission_format.csv')
sub_form.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7


In [75]:
#  insert the respondent id as a column
sub_preds.insert(loc=0,column='respondent_id',value=DF_sub.index)
sub_preds.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.074726,0.313458
1,26708,0.020953,0.031925
2,26709,0.473723,0.646697
3,26710,0.480853,0.868863
4,26711,0.223085,0.519225


In [76]:
sub_preds.to_csv('preds1_LR_MI_GS.csv',index=False)

preds1_LR == first submission (mode imputed) <br>
preds1_LR_MI == second submission (mean imputed) <br>
preds1_LR_MI_GS == file written by this notebook (mean imputed, `C` chosen by grid search)