In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# One-time working-directory setup: on the first execution in this kernel the
# process cwd is moved up one level; re-running the cell afterwards is a no-op
# because `first_run` is already defined.
try:
    _ = first_run  # NameError here means the flag has not been set yet
except NameError:
    first_run = True
    # os.path.dirname respects the platform's path separator and maps a
    # filesystem root to itself, unlike splitting the path string on "/".
    os.chdir(os.path.dirname(os.getcwd()))

# Load Data

In [None]:
# Locations of the preprocessed splits (relative to the current working directory).
train_archive = "../data/train/preprocessed/train_features_labels.joblib.gz"
validation_archive = "../data/train/preprocessed/validation_features_labels.joblib.gz"

# Each archive is unpacked into two objects, bound here as X (features) and y (labels).
X_train, y_train = joblib.load(train_archive)
X_validation, y_validation = joblib.load(validation_archive)

# Train and evaluate baseline model

In [None]:
# Fixed seed so that re-running the notebook reproduces the same forest;
# without it the fitted model (and every metric derived from it) varies per run.
BASELINE_SEED = 42

# Baseline: a random forest with library-default hyperparameters.
baseline = RandomForestClassifier(random_state=BASELINE_SEED).fit(X_train, y_train)

# Persist the fitted model; binding the return value to `_` keeps
# joblib.dump's result out of the cell output.
_ = joblib.dump(baseline, "../ml_artifacts/baseline_model.joblib.gz")

In [None]:
# Class-probability predictions for the validation split. Column 1 is used
# below as the positive-class score (assumes labels are 0/1 — TODO confirm).
prediction = baseline.predict_proba(X_validation)
positive_score = prediction[:, 1]

# Thresholds 0.05, 0.10, ..., 0.90. Built from an integer grid because a
# float-step np.arange can yield inexact values such as 0.15000000000000002,
# which end up in the saved CSV and make the strict `>` comparison
# inconsistent for scores that sit exactly on a threshold.
thresholds = np.arange(5, 95, 5) / 100

threshold_perf = pd.DataFrame(
    [
        (
            threshold,
            # labels=[0, 1] pins the 2x2 layout so ravel() always yields
            # (tn, fp, fn, tp), even if a class is absent at this threshold.
            *confusion_matrix(
                y_validation,
                (positive_score > threshold).astype(int),
                labels=[0, 1],
            ).ravel(),
        )
        for threshold in thresholds
    ],
    columns=["threshold", "tn", "fp", "fn", "tp"],
).assign(
    # A 0/0 ratio (e.g. nothing predicted positive) shows up as NaN here
    # and propagates into f1.
    precision=lambda df: df["tp"] / (df["tp"] + df["fp"]),
    recall=lambda df: df["tp"] / (df["tp"] + df["fn"]),
    f1=lambda df: 2 * (df["precision"] * df["recall"]) / (df["precision"] + df["recall"]),
)

# Save the per-threshold metrics next to the model artifact.
threshold_perf.to_csv("../ml_artifacts/baseline_model_performance.csv", index=False)

In [None]:
def highlight_max(data, color='yellow'):
    '''
    Build CSS styling that marks the largest value(s) of `data`.

    data  : a Series (styler called with axis=0 or axis=1) or a DataFrame
            (styler called with axis=None).
    color : CSS colour used for the background of the highlighted cells.

    Returns a list of CSS strings for a Series, or a DataFrame of CSS strings
    with the same index/columns for a DataFrame. Cells that are not the
    maximum get an empty string.
    '''
    css = f'background-color: {color}'

    if data.ndim != 1:
        # Two-dimensional input: compare every cell against the single overall maximum.
        overall_max = data.max().max()
        hits = data == overall_max
        return pd.DataFrame(np.where(hits, css, ''),
                            index=data.index, columns=data.columns)

    # One-dimensional input: every element equal to the maximum is highlighted.
    peak = data.max()
    return [css if hit else '' for hit in data == peak]


# Highlight the best value of each metric column; the Styler is left as the
# cell's last expression so the notebook renders it.
metric_columns = ["precision", "recall", "f1"]
threshold_perf.style.apply(highlight_max, color="darkorange", subset=metric_columns)