In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

import pandas as pd
import numpy as np
import warnings
# Ignore every FutureWarning-category warning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def _load_indexed_csv(path):
    """Read a CSV file into a DataFrame indexed by its 'respondent_id' column."""
    return pd.read_csv(path, index_col='respondent_id')


# Feature and label tables for the train and test partitions.
X_train = _load_indexed_csv('../data/X_train_flu.csv')
X_test = _load_indexed_csv('../data/X_test_flu.csv')
y_train = _load_indexed_csv('../data/y_train_flu.csv')
y_test = _load_indexed_csv('../data/y_test_flu.csv')

In [None]:
# Reduce each label table to the single 'seasonal_vaccine' column (a Series).
# The isinstance guard makes the cell safe to re-run: once the column has been
# extracted, indexing the resulting Series by 'seasonal_vaccine' again would
# raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['seasonal_vaccine']
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['seasonal_vaccine']

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Missing-value count for each training feature column.
X_train.isnull().sum(axis=0)

In [None]:
# Missing-value count in the training target.
y_train.isnull().sum()

In [None]:
# Partition the training columns by dtype so each group gets its own
# preprocessing: 'object' columns vs. float64/int64 columns.
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes(['float64', 'int64'])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the notebook runs on either side of the rename.
try:
    _dense_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    _dense_ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode into a dense array; categories unseen during fit are ignored instead
# of raising.
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', _dense_ohe),
])
# Numeric columns: fill gaps with the most frequent value; no scaling is applied.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
])

# Route each column group through its own pipeline.
transformer = ColumnTransformer([
    ('categorical', cat_pipe, X_train_cat.columns),
    ('numerical', num_pipe, X_train_num.columns),
])

In [None]:
# Baseline model: preprocessing followed by a random forest with default
# hyper-parameters. random_state is pinned so that a fresh Restart & Run All
# rebuilds the same forest and reports the same scores.
model_pipe = Pipeline(steps=[
    ('trans', transformer),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2, random_state=42))
])

model_pipe.fit(X_train, y_train)

In [None]:
# Per-fold F1 of the baseline pipeline on the training data
# (cv is left at cross_val_score's default).
rfc_cv_score = cross_val_score(
    estimator=model_pipe,
    X=X_train,
    y=y_train,
    scoring='f1',
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Average F1 across the baseline cross-validation folds.
np.mean(rfc_cv_score)

# Random Forest Classifier

In [None]:
# Unfitted preprocessing + random-forest pipeline used as the base estimator
# for hyper-parameter search. random_state is pinned so candidates are compared
# on identical randomness and the search is reproducible across runs.
model_pipe_2 = Pipeline(steps=[
    ('trans', transformer),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2, random_state=42))
])

In [None]:
# Random-forest search space. Keys follow the '<pipeline step>__<parameter>'
# convention, so every entry targets the 'rfc' step of the pipeline.
params = {
    'rfc__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90],
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': [100, 150, 200],
    'rfc__min_samples_leaf': [2, 4, 6, 8],
    'rfc__min_samples_split': [2, 4, 6, 8],
}

In [None]:
# Exhaustive search over `params` for the random-forest pipeline,
# scoring every candidate by F1 with 3-fold cross-validation.
gs_rfc = GridSearchCV(
    estimator=model_pipe_2,
    param_grid=params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=3,
)
gs_rfc.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination the random-forest grid search scored best.
gs_rfc.best_params_

In [None]:
# Re-score the tuned random-forest pipeline with cross-validation.
# GridSearchCV exposes the refit winner as `best_estimator_` (trailing
# underscore); `best_estimator` does not exist and raised AttributeError.
rfc_cv_1 = cross_val_score(gs_rfc.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3, scoring = 'f1')

In [None]:
# Average F1 across the cross-validation folds for the tuned random forest.
np.mean(rfc_cv_1)

## Gradient Boosting Classifier

In [None]:
# Unfitted preprocessing + gradient-boosting pipeline used as the base
# estimator for hyper-parameter search. random_state is pinned so the random
# feature sub-sampling enabled by `max_features` is repeatable across runs.
model_pipe_3 = Pipeline(steps=[
    ('trans', transformer),
    ('gbc', GradientBoostingClassifier(verbose=3, random_state=42))
])

In [None]:
# List every tunable parameter name of the pipeline (nested steps included),
# i.e. the keys that a parameter grid may reference.
model_pipe_3.get_params(deep=True)

In [None]:
# Gradient-boosting search space. Keys follow the '<pipeline step>__<parameter>'
# convention, so every entry targets the 'gbc' step of the pipeline.
params = {
    'gbc__learning_rate': [0.001, 0.01, 0.1, .5, .9],
    'gbc__n_estimators': list(range(100,250,50)),
    'gbc__min_samples_leaf': list(range(2,10,2)),
    'gbc__min_samples_split': list(range(2,10,2)),
    # Starts at 50 rather than 0: an integer max_features must be >= 1, so the
    # former 0 candidate made every fit that used it fail.
    # NOTE(review): 100 and 150 assume the one-hot-encoded matrix has at least
    # that many columns - confirm against the transformer's output width.
    'gbc__max_features': list(range(50,200,50))
}

In [None]:
# Exhaustive search over `params` for the gradient-boosting pipeline,
# scoring every candidate by F1 with 3-fold cross-validation.
gs_gbc = GridSearchCV(
    estimator=model_pipe_3,
    param_grid=params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gs_gbc.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination the gradient-boosting grid search scored best.
gs_gbc.best_params_

In [None]:
# Re-score the tuned gradient-boosting pipeline with cross-validation.
# GridSearchCV exposes the refit winner as `best_estimator_` (trailing
# underscore); `best_estimator` does not exist and raised AttributeError.
gbc_cv_1 = cross_val_score(gs_gbc.best_estimator_, X_train, y_train, n_jobs=-1, verbose=3, scoring = 'f1')

In [None]:
# Average F1 across the cross-validation folds for the tuned gradient-boosting model.
np.mean(gbc_cv_1)