In [1]:
import copy
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Import all candidates to support automatic decision making
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [2]:
# One-time notebook setup: on the first execution of this cell, move the
# working directory to the parent of the current one and import the project's
# feature helpers. `first_run` acts as a sentinel so that re-running the cell
# does not climb another directory level.
try:
    _ = first_run
except NameError:
    first_run = True
    # os.path.dirname is separator-agnostic; splitting on a literal "/" left
    # the working directory unchanged on platforms that use "\\".
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported after the chdir on purpose.
    # NOTE(review): presumably `_aux` is only importable from the new working
    # directory — confirm.
    from _aux import features as F

# Load Data

Now that we are ready to present our final results, we load the test set that was held out at step "0_split_data", together with the undersampled, preprocessed training set on which the chosen model is refitted.

In [3]:
# Training features and labels are stored together in one joblib artifact
# (the undersampled, already preprocessed training set).
X_train, y_train = joblib.load(
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)

# Held-out test features and labels live in separate CSV files; the first
# column of each file is used as the index.
X_test = pd.read_csv("../data/test/X_test.csv", index_col=0)
y_test = pd.read_csv("../data/test/y_test.csv", index_col=0)

# Load the preprocessor and transform test data

In [4]:
# Load the serialized preprocessor artifact; it is used below to transform the
# raw test features and later receives the fitted model.
preprocessor = joblib.load("../ml_artifacts/preprocessor.joblib.gz")

In [5]:
# Apply the loaded preprocessor to the raw test features. Only `transform`
# (not `fit`) is called here on the test set.
X_test_preproc = preprocessor.transform(X_test)

# Choose model based on GridSearch performance

In [6]:
# Load the stored parameter-search results and keep the first row as the
# chosen candidate; its `estimator` and `params` fields are used below.
# NOTE(review): `.iloc[0]` is only the best candidate if the stored table is
# sorted best-first — confirm against the step that writes this artifact.
best_model = joblib.load(
    "../ml_artifacts/gridsearch_results/param_search_result.joblib.gz"
).iloc[0]

# Fit the best model

In [7]:
# Instantiate the selected estimator with its stored parameters and fit it on
# the training data loaded above.
# NOTE(review): `eval` executes whatever string is stored in
# `best_model.estimator`, so this is only safe while the grid-search artifact
# is trusted. Presumably the string is the name of one of the classifier
# classes imported at the top of the notebook — confirm, and consider an
# explicit name -> class mapping instead.
model = eval(best_model.estimator)(**best_model.params).fit(X_train, y_train)

In [8]:
# Class-membership probabilities for the preprocessed test set. Column 1 is
# used below as the score that is compared against each threshold
# (assumed to be the positive class, label 1 — TODO confirm via model.classes_).
prediction = model.predict_proba(X_test_preproc)

# Sweep the decision threshold from 0.05 to 0.95 in steps of 0.05 and record
# the confusion-matrix cells plus precision / recall / F1 for each value.
#
# * np.linspace(...).round(2) replaces np.arange(0.05, 1, 0.05): arange with a
#   float step produces values such as 0.15000000000000002 (which would be
#   written verbatim to the CSV) and its length depends on rounding error.
# * labels=[0, 1] pins the confusion matrix to 2x2 so that .ravel() always
#   yields exactly (tn, fp, fn, tp), matching the hard-coded column names.
threshold_perf = pd.DataFrame(
    [
        (
            threshold,
            *confusion_matrix(
                y_test,
                (prediction[:, 1] > threshold).astype(int),
                labels=[0, 1],
            ).ravel(),
        )
        for threshold in np.linspace(0.05, 0.95, 19).round(2)
    ],
    columns=["threshold", "tn", "fp", "fn", "tp"],
).assign(
    # A threshold at which nothing is predicted positive gives tp + fp == 0;
    # precision (and therefore f1) is then expected to come out as NaN.
    precision=lambda df: df["tp"] / (df["tp"] + df["fp"]),
    recall=lambda df: df["tp"] / (df["tp"] + df["fn"]),
    f1=lambda df: 2
    * (df["precision"] * df["recall"])
    / (df["precision"] + df["recall"]),
)

# Persist the per-threshold metrics (without the positional index).
threshold_perf.to_csv("../ml_artifacts/model_performance.csv", index=False)

In [9]:
def highlight_max(data, color="yellow"):
    """
    Build CSS background styles marking the largest value(s) in ``data``.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        A Series (one row or column handed over by ``Styler.apply``) or a
        whole DataFrame (``Styler.apply(..., axis=None)``).
    color : str, default "yellow"
        Value placed after ``background-color:`` in the generated CSS.

    Returns
    -------
    list of str or pandas.DataFrame
        For a Series, a list with one CSS string per element. For a
        DataFrame, a DataFrame of CSS strings with the same index and
        columns. Entries that are not the maximum are empty strings; every
        entry equal to the maximum is highlighted (ties included).
    """
    css = f"background-color: {color}"

    if data.ndim != 1:
        # Whole-frame case: compare every cell with the overall maximum.
        overall_peak = data.max().max()
        return pd.DataFrame(
            np.where(data == overall_peak, css, ""),
            index=data.index,
            columns=data.columns,
        )

    # Series case: compare each element with the Series maximum.
    peak = data.max()
    return [css if hit else "" for hit in data == peak]


# Display the threshold table with `highlight_max` applied (green background)
# to the precision, recall and f1 columns only. No axis is passed, so pandas'
# default for Styler.apply is used.
# NOTE(review): that default is column-wise per the pandas docs — confirm for
# the installed version.
threshold_perf.style.apply(
    highlight_max, color="green", subset=["precision", "recall", "f1"]
)

Unnamed: 0,threshold,tn,fp,fn,tp,precision,recall,f1
0,0.05,3568,14170,0,258,0.017882,1.0,0.035136
1,0.1,4294,13444,1,257,0.018758,0.996124,0.036822
2,0.15,6231,11507,4,254,0.021597,0.984496,0.042266
3,0.2,6699,11039,4,254,0.022492,0.984496,0.043979
4,0.25,8210,9528,8,250,0.025568,0.968992,0.049821
5,0.3,10212,7526,12,246,0.031652,0.953488,0.06127
6,0.35,10929,6809,14,244,0.034595,0.945736,0.066749
7,0.4,11028,6710,14,244,0.035088,0.945736,0.067665
8,0.45,12744,4994,31,227,0.043478,0.879845,0.082862
9,0.5,13184,4554,39,219,0.045883,0.848837,0.08706


# Add model to pipeline and save it

In [10]:
# Attach the fitted model to a deep copy of the preprocessor rather than to
# the preprocessor itself. By scikit-learn convention `set_params` modifies
# the estimator in place and returns it, so calling it directly on
# `preprocessor` would leave that name pointing at the full model pipeline,
# and re-running the earlier `preprocessor.transform(X_test)` cell would no
# longer behave like a pure preprocessing step.
# NOTE(review): presumably `preprocessor` is a scikit-learn Pipeline with a
# placeholder step named "model" — confirm.
pipeline = copy.deepcopy(preprocessor).set_params(model=model)

# Persist the complete pipeline (preprocessing + fitted model).
joblib.dump(pipeline, "../ml_artifacts/pipeline.joblib.gz")

['../ml_artifacts/pipeline.joblib.gz']

# Batch-predict the required set

In [11]:
# Read the records that need a prediction. Unlike the test CSVs above, no
# index column is specified here; the "uuid" and "default" columns of this
# frame are used in the next cell.
to_predict = pd.read_csv("../data/predict/to_predict.csv")

In [12]:
# Score every record with the saved pipeline and write the result next to its
# uuid. The "default" column is dropped before scoring, and column 1 of
# `predict_proba` is stored under the name "pd".
to_predict[["uuid"]].assign(
    pd=pipeline.predict_proba(
        to_predict.drop(columns="default")
    )[:, 1]
).to_csv(
    "../data/predict/predictions.csv",
    index=False,
)