In [None]:
import copy
import os

import joblib
import numpy as np
import pandas as pd

# Import all candidates to support automatic decision making
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [None]:
# One-time working-directory setup. The guard variable survives cell re-runs,
# so the directory change below is applied only once per kernel session.
try:
    _ = first_run
except NameError:
    first_run = True
    # Move to the parent of the current directory. os.path.dirname uses the
    # platform's path rules, whereas splitting on "/" yields "" for a
    # root-level directory (chdir("") fails) and never matches on Windows.
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported after the directory change — presumably `_aux` is only
    # resolvable from the parent directory; TODO confirm.
    from _aux import features as F

# Load Data

Now that we are ready to present our final results, we load the test set that was held out in step "0_split_data", together with the preprocessed (undersampled) training set used to fit the final model.

In [None]:
# Training data: a (features, labels) pair stored together in one joblib file.
train_archive = (
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)
X_train, y_train = joblib.load(train_archive)

# Held-out test split; the first CSV column becomes the index of each frame.
X_test = pd.read_csv("../data/test/X_test.csv", index_col=0)
y_test = pd.read_csv("../data/test/y_test.csv", index_col=0)

# Load the preprocessor and transform test data

In [None]:
# Preprocessor artifact; it is used below via .transform() without being
# fitted in this notebook, so it is expected to be stored already fitted.
preprocessor_path = "../ml_artifacts/preprocessor.joblib.gz"
preprocessor = joblib.load(preprocessor_path)

In [None]:
# Transform only (no fitting) the held-out test features with the loaded
# preprocessor, producing the input for the model's predict_proba below.
X_test_preproc = preprocessor.transform(X_test)

# Choose model based on GridSearch performance

In [None]:
# Take the first row of the stored parameter-search results.
# NOTE(review): presumably the table is sorted best-first — confirm upstream.
search_results = joblib.load(
    "../ml_artifacts/gridsearch_results/param_search_result.joblib.gz"
)
best_model = search_results.iloc[0]

# Fit the best model

In [None]:
# Resolve the estimator name stored in the search results through an explicit
# table of the candidate classes imported at the top of the notebook, instead
# of eval(): only known estimators can be instantiated, and an unexpected name
# produces a clear error rather than evaluating arbitrary text.
# NOTE(review): assumes `best_model.estimator` holds the bare class name.
CANDIDATE_ESTIMATORS = {
    "RandomForestClassifier": RandomForestClassifier,
    "XGBClassifier": XGBClassifier,
}

if best_model.estimator not in CANDIDATE_ESTIMATORS:
    raise ValueError(
        "Unknown estimator {!r}; expected one of {}".format(
            best_model.estimator, sorted(CANDIDATE_ESTIMATORS)
        )
    )

# Refit the selected estimator with its best parameters on the training set.
model = CANDIDATE_ESTIMATORS[best_model.estimator](**best_model.params).fit(
    X_train, y_train
)

In [None]:
# Class-probability estimates for the preprocessed test set; column 1 is the
# score that gets thresholded below.
prediction = model.predict_proba(X_test_preproc)
positive_scores = prediction[:, 1]

# One row of flattened confusion-matrix counts per decision threshold,
# stepping by 0.05 from 0.05 up to (not including) 1.
confusion_rows = []
for threshold in np.arange(0.05, 1, 0.05):
    predicted_labels = (positive_scores > threshold).astype(int)
    counts = confusion_matrix(y_test, predicted_labels).ravel()
    confusion_rows.append((threshold, *counts))

count_table = pd.DataFrame(
    confusion_rows, columns=["threshold", "tn", "fp", "fn", "tp"]
)

# Metrics are derived in order, so f1 can read the precision and recall
# columns created just before it.
threshold_perf = count_table.assign(
    precision=lambda perf: perf["tp"] / (perf["tp"] + perf["fp"]),
    recall=lambda perf: perf["tp"] / (perf["tp"] + perf["fn"]),
    f1=lambda perf: 2
    * (perf["precision"] * perf["recall"])
    / (perf["precision"] + perf["recall"]),
)

threshold_perf.to_csv("../ml_artifacts/model_performance.csv", index=False)

In [None]:
def highlight_max(data, color="yellow"):
    """
    Build CSS styles that mark the largest value(s) with a background color.

    Parameters
    ----------
    data : pandas Series or DataFrame
        A Series (as passed by ``Styler.apply`` with ``axis=0`` or ``axis=1``)
        or a DataFrame (as passed with ``axis=None``).
    color : str
        Color name inserted into the ``background-color`` rule.

    Returns
    -------
    list of str or pandas.DataFrame
        For 1-D input, a list with the CSS rule at every position equal to the
        maximum and ``""`` elsewhere. For 2-D input, a DataFrame of the same
        index and columns, marking cells equal to the overall maximum.
    """
    css = f"background-color: {color}"

    if data.ndim != 1:
        # 2-D input: compare every cell with the maximum over the whole frame.
        overall_max = data.max().max()
        return pd.DataFrame(
            np.where(data == overall_max, css, ""),
            index=data.index,
            columns=data.columns,
        )

    # 1-D input: compare every element with the maximum of the Series.
    peak = data.max()
    return [css if hit else "" for hit in data == peak]


# Highlight in green, within each metric column, the row(s) where it peaks.
metric_columns = ["precision", "recall", "f1"]
threshold_perf.style.apply(highlight_max, color="green", subset=metric_columns)

# Add model to pipeline and save it

In [None]:
# Attach the fitted model to a deep copy of the preprocessor. scikit-learn's
# set_params modifies the object it is called on and returns that same object,
# so calling it on `preprocessor` directly would make `pipeline` an alias of
# `preprocessor` and leave the latter with a model step attached; re-running
# the earlier transform cell would then no longer behave the same.
pipeline = copy.deepcopy(preprocessor).set_params(model=model)

# Persist the complete preprocessing + model pipeline for later use.
joblib.dump(pipeline, "../ml_artifacts/pipeline.joblib.gz")

# Batch-predict the required set

In [None]:
# Rows to score; read without an index column, so every CSV column (including
# "uuid") stays a regular column of the frame.
to_predict_path = "../data/predict/to_predict.csv"
to_predict = pd.read_csv(to_predict_path)

In [None]:
# Identifier column that accompanies each prediction in the output file.
uuid_frame = to_predict[["uuid"]]

# Score every row with column 1 of predict_proba; only "default" is removed
# from the input beforehand.
# NOTE(review): "uuid" is still part of the frame handed to the pipeline —
# confirm the preprocessor ignores it.
features_to_score = to_predict.drop(columns="default")
default_probability = pipeline.predict_proba(features_to_score)[:, 1]

uuid_frame.assign(pd=default_probability).to_csv(
    "../data/predict/predictions.csv", index=False
)