In [1]:
# imports
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [2]:
X_train = pd.read_csv('../data/X_train_flu.csv', index_col= 'respondent_id')
X_test = pd.read_csv('../data/X_test_flu.csv', index_col= 'respondent_id')
y_train = pd.read_csv('../data/y_train_flu.csv', index_col= 'respondent_id')
y_test = pd.read_csv('../data/y_test_flu.csv', index_col= 'respondent_id')

In [3]:
y_train = y_train['seasonal_vaccine']
y_test = y_test['seasonal_vaccine']

In [4]:
X_train.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7273,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,Other or Multiple,Female,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,qufhixun,"MSA, Not Principle City",1.0,0.0
13773,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,White,Female,,Not Married,Own,Employed,lrircsnp,Non-MSA,0.0,0.0
24162,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,White,Female,"> $75,000",Married,Own,Employed,bhuqouqj,"MSA, Not Principle City",1.0,0.0
17341,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,Black,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,fpwskwrf,"MSA, Principle City",1.0,2.0
23521,2.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lzgpxyit,"MSA, Not Principle City",1.0,0.0


In [5]:
X_train.isna().sum()

h1n1_concern                     69
h1n1_knowledge                   85
behavioral_antiviral_meds        51
behavioral_avoidance            155
behavioral_face_mask             14
behavioral_wash_hands            30
behavioral_large_gatherings      55
behavioral_outside_home          55
behavioral_touch_face            90
doctor_recc_h1n1               1605
doctor_recc_seasonal           1605
chronic_med_condition           745
child_under_6_months            617
health_worker                   606
opinion_h1n1_vacc_effective     306
opinion_h1n1_risk               290
opinion_h1n1_sick_from_vacc     293
opinion_seas_vacc_effective     350
opinion_seas_risk               394
opinion_seas_sick_from_vacc     414
age_group                         0
education                      1048
race                              0
sex                               0
income_poverty                 3261
marital_status                 1053
rent_or_own                    1518
employment_status           

In [6]:
y_train.isna().sum()

0

In [7]:
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent'))
])

transformer = ColumnTransformer([('categorical', cat_pipe, X_train_cat.columns),
                                 ('numerical', num_pipe, X_train_num.columns)])

In [8]:
logreg_pipe = Pipeline([
    ('trans', transformer),
    ('logreg', LogisticRegression(n_jobs=-1, random_state=42))
])
logreg_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype=...
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
   

In [9]:
logreg_cv = cross_val_score(logreg_pipe, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.6s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished


In [10]:
logreg_cv.mean()

0.7760359460808786

# Logistic Regression with Grid Search

In [11]:
logreg_pipe_grid = {'logreg__C': [1e-2, 1, 1e2]}
gs_logreg_pipe = GridSearchCV(estimator=logreg_pipe, param_grid=logreg_pipe_grid,
                              cv=5, verbose=3, n_jobs=-1)
gs_logreg_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_...
       'chronic_med_condition', 'child_under_6_mont

In [12]:
gs_logreg_pipe.best_params_

{'logreg__C': 100.0}

In [13]:
log_cv_f1 = cross_val_score(gs_logreg_pipe.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3) 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [14]:
log_cv_f1.mean()

0.7762356465302046

# KNN

In [15]:
knn_pipe = Pipeline([
    ('trans', transformer),
    ('knn', KNeighborsClassifier(n_jobs=-1))
])
knn_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype=...
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
   

In [16]:
knn_pipe_cv = cross_val_score(knn_pipe, X_train, y_train, n_jobs=-1, verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.8s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s finished


In [17]:
knn_pipe_cv.mean()

0.7230654018971543

## KNN with a Grid Search

In [18]:
knn_pipe_grid = {'knn__n_neighbors': [5, 7, 9], 'knn__p': [1, 2, 3]}
gs_knn_pipe = GridSearchCV(
    estimator=knn_pipe, param_grid=knn_pipe_grid, cv=5, n_jobs=-1, verbose=3)
gs_knn_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


exception calling callback for <Future at 0x23d19390910 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\jkahl\anaconda3\envs\learn-env\lib\site-packages\joblib\_parallel_bac

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
gs_knn_pipe.best_params_

In [None]:
knn_pipe_cv = cross_val_score(gs_knn_pipe.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3)

In [None]:
knn_pipe_cv.mean()

# Support Vector Classifier

In [None]:
svm_pipe = Pipeline([
    ('trans', transformer), 
    ('svm', SVC())
])
svm_pipe.fit(X_train, y_train)

In [None]:
svc_cv_score = cross_val_score(svm_pipe, X_train, y_train, n_jobs = -1, verbose = 3)

In [None]:
svc_cv_score.mean()

# SVC with a Grid Search

In [None]:
svm_pipe_grid = {'svm__C': [.001, 1, 100, 1000],
                 'svm__gamma': [0.001, 1, 100]}

In [None]:
gs_svm_pipe = GridSearchCV(estimator=svm_pipe, param_grid=svm_pipe_grid, verbose=1, cv=5, n_jobs=-1)
gs_svm_pipe.fit(X_train, y_train)

In [None]:
gs_svm_pipe.best_params_

In [None]:
svc_cv_score = cross_val_score(gs_svm_pipe.best_estimator_, X_train, y_train, n_jobs = -1, verbose = 3)

In [None]:
svc_cv_score.mean()