In [1]:
import copy
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Import all candidates to support automatic decision making
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# One-time bootstrap: the first time this cell runs in a kernel, step the working
# directory up one level and import the local feature helpers. `first_run` acts as
# the sentinel, so re-running the cell does not climb another directory.
try:
    _ = first_run
except NameError:
    first_run = True
    # os.path.dirname understands the platform's path separator; splitting on a
    # literal "/" silently leaves the directory unchanged when another separator is used.
    os.chdir(os.path.dirname(os.getcwd()))
    from _aux import features as F

# Load Data

Now that we are ready to present our final results, we load the test set that was held out in step "0_split_data", together with the undersampled training set used to fit the final model.

In [3]:
# Training features and labels are stored together in one joblib archive.
(X_train, y_train) = joblib.load(
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)

# Held-out test features and labels; the first CSV column is used as the index.
X_test = pd.read_csv("../data/test/X_test.csv", index_col=0)
y_test = pd.read_csv("../data/test/y_test.csv", index_col=0)

# Load the preprocessor and transform test data

In [4]:
# Load the persisted preprocessing object from the artifacts folder.
# NOTE(review): presumably already fitted, since it is only used via .transform() below — confirm.
preprocessor = joblib.load("../ml_artifacts/preprocessor.joblib.gz")

In [5]:
# Apply the loaded preprocessor to the test features. Only transform() is called,
# so nothing is re-fitted on the held-out data.
X_test_preproc = preprocessor.transform(X_test)

# Choose model based on GridSearch performance

In [6]:
# Take the first row of the stored grid-search result table as the winning configuration.
# Later cells read its `estimator` and `params` fields.
# NOTE(review): assumes the saved table is sorted best-first — confirm against the grid-search notebook.
best_model = joblib.load("../ml_artifacts/gridsearch_results/2021-05-23T14:13:24broad_param_search_result.joblib.gz").iloc[0]

# Fit the best model

In [7]:
# Build the selected estimator from its stored name and parameters, then fit it on the training set.
# NOTE(review): eval() executes whatever expression is stored in `best_model.estimator`. It is
# presumably the name of one of the candidate classes imported at the top of the notebook — confirm.
# If the grid-search artifact could ever come from an untrusted source, replace eval() with an
# explicit name -> class mapping.
model = eval(best_model.estimator)(**best_model.params).fit(X_train, y_train)

In [8]:
# Sweep decision thresholds over the model's test-set predictions and record the
# confusion-matrix counts plus precision / recall / F1 for each threshold.

# The raw predictions do not depend on the threshold, so score the test set once
# instead of re-running inference for every threshold in the sweep.
test_scores = model.predict(X_test_preproc)

threshold_perf = pd.DataFrame(
    [
        (
            threshold,
            # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below
            # still works if the labels and thresholded predictions contain only one
            # class. Assumes y_test is 0/1 encoded, like the .astype(int) predictions.
            *confusion_matrix(
                y_test, (test_scores > threshold).astype(int), labels=[0, 1]
            ).ravel(),
        )
        for threshold in np.arange(.05, 1, .05)
    ],
    columns=["threshold", "tn", "fp", "fn", "tp"]
).assign(
    precision=lambda df: df["tp"] / (df["tp"] + df["fp"]),
    recall=lambda df: df["tp"] / (df["tp"] + df["fn"]),
    f1=lambda df: 2 * (df["precision"] * df["recall"]) / (df["precision"] + df["recall"])
)

# Persist the sweep so the chosen operating point can be inspected outside the notebook.
threshold_perf.to_csv("../ml_artifacts/model_performance.csv", index=False)

In [9]:
def highlight_max(data, color='yellow'):
    '''
    Build CSS background styles that mark the largest value(s) of `data`.

    For a Series, returns a list with one style string per element. For a
    DataFrame, returns a DataFrame of style strings with the same index and
    columns, where the maximum is taken over the whole frame. Cells that are
    not the maximum get an empty string.
    '''
    css = f'background-color: {color}'

    if data.ndim != 1:  # whole DataFrame, as passed by .apply(axis=None)
        overall_peak = data.max().max()
        return pd.DataFrame(
            np.where(data == overall_peak, css, ''),
            index=data.index,
            columns=data.columns,
        )

    # Single Series, as passed by .apply(axis=0) or .apply(axis=1)
    peak = data.max()
    return [css if hit else '' for hit in data == peak]


# Render the sweep table with the best precision, recall and F1 highlighted in green.
# No `axis` is passed, so Styler.apply uses its default (axis=0, i.e. column-wise, per the
# pandas docs — confirm for the installed version); each metric column gets its own maximum marked.
threshold_perf.style.apply(
    highlight_max, color='green', subset=["precision", "recall", 'f1']
)

Unnamed: 0,threshold,tn,fp,fn,tp,precision,recall,f1
0,0.05,3695,14043,0,258,0.018041,1.0,0.035442
1,0.1,6009,11729,3,255,0.021278,0.988372,0.04166
2,0.15,6983,10755,4,254,0.023072,0.984496,0.045087
3,0.2,8481,9257,10,248,0.026092,0.96124,0.050804
4,0.25,9797,7941,12,246,0.030048,0.953488,0.058259
5,0.3,10740,6998,14,244,0.033692,0.945736,0.065067
6,0.35,11433,6305,18,240,0.036669,0.930233,0.070557
7,0.4,12117,5621,25,233,0.039802,0.903101,0.076243
8,0.45,12575,5163,36,222,0.041226,0.860465,0.078682
9,0.5,13069,4669,44,214,0.043826,0.829457,0.083252


# Add model to pipeline and save it

In [10]:
# Assemble the deployable pipeline (preprocessing steps followed by the fitted model) and save it.
# The steps are attached to a shallow copy with a fresh list, so `preprocessor` itself is left
# untouched. Appending in place made this cell non-idempotent: re-running it added a second
# "model" step, and re-running the transform cell afterwards would hit the model step.
full_pipeline = copy.copy(preprocessor)
full_pipeline.steps = [*preprocessor.steps, ("model", model)]
joblib.dump(full_pipeline, "../ml_artifacts/model.joblib")

['../ml_artifacts/model.joblib']