In [None]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir
import numpy as np
import pandas as pd

import pickle

# sklearn libraries

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer

from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model

# IPython
from IPython.core.interactiveshell import InteractiveShell

##### Config settings

In [None]:
# Paths: everything is resolved relative to the directory two levels above
# the current working directory.
parent_dir = abspath(join(getcwd(), pardir, pardir))
data_dir = join(parent_dir, "data")
model_dir = join(parent_dir, "models")
data_file = join(data_dir, "preprocessed.csv")

# IPython: echo the value of every expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# pandas: widen the display limits
pd.set_option("display.max_columns", 200)  # up to 200 columns (default is 20)
pd.set_option("display.max_rows", 200)     # up to 200 rows (default is 60)

# Seed shared by every estimator and CV splitter below
__random_state = 0

#### Helper functions

In [None]:
def save_model(model, file_path: str) -> None:
    """
    Save model as a pickle file

    Missing parent directories of ``file_path`` are created first, so saving
    into a folder that does not exist yet (e.g. a fresh ``models`` directory)
    no longer fails with ``FileNotFoundError``.

    Parameters
    ----------
    model : object
        Any picklable object.
    file_path : str
        Destination path of the pickle file; an existing file is overwritten.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    pickle.PicklingError
        If ``model`` cannot be pickled.
    """
    # Local imports: the notebook's import cell only pulls in a few names from os.
    from os import makedirs
    from os.path import dirname

    target_dir = dirname(file_path)
    if target_dir:  # empty when file_path is a bare file name in the cwd
        makedirs(target_dir, exist_ok=True)

    with open(file_path, "wb") as file:
        pickle.dump(model, file)

def load_model(file_path: str):
    """
    Read a pickle file and return the object stored in it

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you trust (e.g. ones written by ``save_model``).
    """
    with open(file_path, "rb") as pickle_file:
        model = pickle.load(pickle_file)
    return model

def dataframe_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Write a dataframe to ``file_path`` in CSV format, without the index column
    """
    df.to_csv(path_or_buf=file_path, index=False)

def get_best_clf(clf, param_grid, X, y, **kwargs):
    """
    Grid Search with stratified splitting and other parameters

    Runs an exhaustive cross-validated search over ``param_grid`` and returns
    the fitted ``GridSearchCV`` object itself, so callers can read
    ``best_params_``, ``best_score_``, ``best_estimator_`` etc. from it.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    X, y :
        Features and target the search is fitted on.
    **kwargs :
        Extra keyword arguments forwarded to ``GridSearchCV`` (e.g.
        ``verbose``). ``scoring``, ``cv`` and ``n_jobs`` may be passed to
        override the defaults below.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # The defaults are applied only when the caller did not supply a value.
    # They used to be hard-coded next to **kwargs, so passing e.g. n_jobs=1
    # raised "got multiple values for keyword argument".
    if 'scoring' not in kwargs:
        # Micro-averaged F1.
        # NOTE: for single-label targets micro-averaged F1 is numerically equal
        # to accuracy; pass scoring=... to rank candidates by another metric.
        kwargs['scoring'] = make_scorer(f1_score, average='micro')

    if 'cv' not in kwargs:
        # 10-fold stratified split, shuffled with the notebook-wide seed
        kwargs['cv'] = StratifiedKFold(n_splits=10, random_state=__random_state, shuffle=True)

    kwargs.setdefault('n_jobs', -1)  # use all available cores unless told otherwise

    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, **kwargs)
    grid_search.fit(X, y)

    return grid_search

#### Load preprocessed data

In [None]:
# Read the preprocessed dataset, decoding the file as ISO-8859-1
df = pd.read_csv(filepath_or_buffer=data_file, encoding="ISO-8859-1")
df.head(n=5)  # preview the first five rows

## Modelling

In [None]:
# define feature and target variables
# `features` is built as a new frame rather than dropping the column in place
# on an alias of `df`: the loaded frame stays intact and this cell can be
# re-run without a KeyError for the already-removed 'dec_o' column.
target = df['dec_o']
features = df.drop(columns=['dec_o'])

### 1. Baseline Models

#### 1.1. [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Search space: L2-penalised logistic regression with the lbfgs solver,
# 20 values of C spaced logarithmically between 1e-4 and 1e4.
parameters = dict(
    penalty=['l2'],
    solver=['lbfgs'],
    C=np.logspace(-4, 4, 20),
    max_iter=[10000],
)

classifier_lr = get_best_clf(
    LogisticRegression(random_state=__random_state),
    parameters,
    features,
    target,
    verbose=2,
)

classifier_lr.best_params_

In [None]:
# Display the best logistic-regression estimator found by the grid search
classifier_lr.best_estimator_

In [None]:
# Fresh logistic regression configured with the winning grid-search values;
# best_params_ holds exactly the searched keys: penalty, solver, C, max_iter.
clf_logistic_regression = LogisticRegression(
    random_state=__random_state,
    **classifier_lr.best_params_
)
clf_logistic_regression

In [None]:
# Pickle the tuned logistic-regression estimator into the models directory.
# NOTE(review): no fit() call on this estimator is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_logistic_regression, join(model_dir, "clf_logistic_regression.pkl"))

#### 1.2. [SVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [None]:
# Search space: RBF-kernel SVC over three gamma values and four C values
parameters = dict(
    kernel=['rbf'],
    gamma=[1e-4, 1e-3, 1e-2],
    C=[1, 10, 100, 1000],
)

classifier_sv = get_best_clf(
    SVC(random_state=__random_state),
    parameters,
    features,
    target,
    verbose=2,
)

classifier_sv.best_params_

In [None]:
# Fresh SVC configured with the winning grid-search values;
# best_params_ holds exactly the searched keys: kernel, gamma, C.
clf_svc = SVC(
    random_state=__random_state,
    **classifier_sv.best_params_
)
clf_svc

In [None]:
# Pickle the tuned SVC into the models directory.
# NOTE(review): no fit() call on this estimator is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_svc, join(model_dir, "clf_svc.pkl"))

#### 1.3. [KNN](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Search space: neighbourhood size, vote weighting and distance metric
parameters = dict(
    n_neighbors=[5, 11, 19, 29],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

classifier_kn = get_best_clf(
    KNeighborsClassifier(),
    parameters,
    features,
    target,
    verbose=2,
)

classifier_kn.best_params_

In [None]:
# Fresh k-NN classifier configured with the winning grid-search values;
# best_params_ holds exactly the searched keys: n_neighbors, weights, metric.
clf_knn = KNeighborsClassifier(**classifier_kn.best_params_)
clf_knn

In [None]:
# Pickle the tuned k-NN classifier into the models directory.
# NOTE(review): no fit() call on this estimator is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_knn, join(model_dir, "clf_knn.pkl"))

### 2. [Ensemble models](https://scikit-learn.org/stable/modules/ensemble.html#ensemble)

#### 2.1. [Gradient Boost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

In [None]:
# Search space: loss function, ensemble size, tree depth and per-split feature
# subsampling, at a fixed learning rate of 0.05.
# NOTE(review): the 'deviance' loss name is reported as renamed to 'log_loss'
# in newer scikit-learn releases (1.1+) — verify against the installed version.
parameters = dict(
    loss=['deviance', 'exponential'],
    learning_rate=[0.05],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    max_features=['sqrt', 'log2'],
)

classifier_gb = get_best_clf(
    GradientBoostingClassifier(random_state=__random_state),
    parameters,
    features,
    target,
    verbose=2,
)

classifier_gb.best_params_

In [None]:
# Fresh gradient-boosting classifier configured with the winning grid-search
# values; best_params_ holds exactly the searched keys: loss, learning_rate,
# n_estimators, max_depth, max_features.
clf_gb = GradientBoostingClassifier(
    random_state=__random_state,
    **classifier_gb.best_params_
)
clf_gb

In [None]:
# Pickle the tuned gradient-boosting classifier into the models directory.
# NOTE(review): no fit() call on this estimator is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_gb, join(model_dir, "clf_gb.pkl"))

#### 2.2. [Voting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html)

Let's combine all classifiers and train a voting model

In [None]:
# (name, estimator) pairs shared by the voting and stacking ensembles
estimators = [
    ('lr', clf_logistic_regression),  # logistic regression
    ('sv', clf_svc),                  # svc
    ('kn', clf_knn),                  # knn
    ('gb', clf_gb),                   # gradient boosting
]

# Hard voting: the ensemble is configured to predict by majority vote
clf_voting = VotingClassifier(estimators, voting='hard')
clf_voting

In [None]:
# Pickle the voting ensemble into the models directory.
# NOTE(review): no fit() call on this ensemble is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_voting, join(model_dir, "clf_voting.pkl"))

#### 2.3. [Stacking Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier)

In [None]:
# Stacking ensemble: the same base estimators as the voting model, with a
# default-configured logistic regression as the final (meta) estimator.
clf_stacking = StackingClassifier(
    estimators,
    final_estimator=LogisticRegression(),
)
clf_stacking

In [None]:
# Pickle the stacking ensemble into the models directory.
# NOTE(review): no fit() call on this ensemble is visible in the notebook, so
# the file presumably holds an unfitted model — confirm this is intended.
save_model(clf_stacking, join(model_dir, "clf_stacking.pkl"))