# Model comparison

We have now trained our "best" baseline Regression (length of stay) and Classification (risk 1-5) models.

The regression model can be turned into an equivalent risk model by mapping its predicted Length of Stay onto the day range of each risk category:

Risk Category|Day Range for Risk Category
-----|------
1 - Very low risk|0-6
2 - Low risk|7-10
3 - Normal risk|11-13
4 - Elevated risk|14-15
5 - High risk|>15

We can now compare the best regression model (**catboost**), cast as a classification, against the best classification model (**catboost**).

Inputs|Outputs
---|---
`processed/features-catboost.parquet`|&nbsp;
`models/regression.pickle`|&nbsp;
`models/classification.pickle`|&nbsp;

In [None]:
import math
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import preprocessing
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

sys.path.append("../src/")

from utils import risk_score, train_test_validate_split

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

%matplotlib inline
plt.rcParams["figure.figsize"] = [15, 8]

## Load data

In [None]:
# feature table prepared for catboost (non-one-hot encoded)
features_catboost_df = pd.read_parquet("../../data/processed/features-catboost.parquet")

# display names of the five risk categories; the leading digit is the category
risk_labels = [
    "1 - Very Low Risk",
    "2 - Low Risk",
    "3 - Normal Risk",
    "4 - Elevated Risk",
    "5 - High Risk",
]

# actual risk category of every row, derived from its actual length of stay
features_catboost_df["risk"] = list(
    map(risk_score, features_catboost_df["LENGTH_OF_STAY"])
)

# features and the two targets (regression: days, classification: risk)
# NOTE(review): only LENGTH_OF_STAY is dropped here, so the derived "risk"
# column is still part of X_catboost. If the models consume it this is target
# leakage - confirm against the training notebooks before changing.
X_catboost = features_catboost_df.drop(columns=["LENGTH_OF_STAY"])
y_catboost_reg = features_catboost_df["LENGTH_OF_STAY"]
y_catboost_clf = features_catboost_df["risk"]

# both splits use the same proportions and seed - presumably so that the
# regression and classification test sets hold the same rows in the same
# order (TODO confirm in train_test_validate_split)
split_options = {
    "train_size": 0.70,
    "validate_size": 0.15,
    "test_size": 0.15,
    "random_state": 42,
}

# train/validate/test split for the regression target
(
    X_train_catboost_reg,
    X_validate_catboost_reg,
    X_test_catboost_reg,
    y_train_catboost_reg,
    y_validate_catboost_reg,
    y_test_catboost_reg,
) = train_test_validate_split(X_catboost, y_catboost_reg, **split_options)

# train/validate/test split for the classification target
(
    X_train_catboost_clf,
    X_validate_catboost_clf,
    X_test_catboost_clf,
    y_train_catboost_clf,
    y_validate_catboost_clf,
    y_test_catboost_clf,
) = train_test_validate_split(X_catboost, y_catboost_clf, **split_options)

## Load models

In [None]:
# the regression models are pickled outside the git tree
# NOTE(review): unpickling can execute arbitrary code - only load trusted files
with open("../../models/regression.pickle", "rb") as regression_file:
    models_regression = pickle.load(regression_file)
# last expression of the cell: display what was loaded
models_regression

In [None]:
# the classification models are pickled outside the git tree
# NOTE(review): unpickling can execute arbitrary code - only load trusted files
with open("../../models/classification.pickle", "rb") as classification_file:
    models_classification = pickle.load(classification_file)
# last expression of the cell: display what was loaded
models_classification

## Compare predicted risk score (classification) with equivalent-predicted risk score (regression)

Classification -> Risk score

Regression -> Length Of Stay -> Equivalent risk score

In [None]:
best_model = "catboost"


def _plot_actual_risk_by_prediction(ax, predictions, predicted_column, labels):
    """Draw, for every predicted risk, a stacked bar of actual-risk proportions.

    Args:
        ax: matplotlib axes to draw on.
        predictions: DataFrame holding the actual category in its ``risk``
            column and the predicted category in ``predicted_column``.
        predicted_column: name of the column with the predicted category.
        labels: risk labels; the first character of each is its numeric
            category. They are used both as x tick labels (predicted risk)
            and as legend entries (actual risk).
    """
    levels = [int(label[0]) for label in labels]

    # proportions[i, j]: share of the rows predicted as levels[j] whose
    # actual risk is levels[i]
    proportions = np.zeros((len(levels), len(levels)))
    for column, predicted_risk in enumerate(levels):
        actual = predictions.loc[
            predictions[predicted_column] == predicted_risk, "risk"
        ]
        if actual.empty:
            # nothing was predicted in this category: keep the bar empty
            # instead of dividing by zero
            continue
        for row, actual_risk in enumerate(levels):
            proportions[row, column] = (actual == actual_risk).sum() / len(actual)

    # stack one layer per actual risk category, lowest risk at the bottom
    bottom = np.zeros(len(levels))
    for label, heights in zip(labels, proportions):
        ax.bar(labels, heights, label=label, bottom=bottom, width=0.35)
        bottom = bottom + heights


# setup a subplot figure
fig, axs = plt.subplots(1, 2)
fig.set_size_inches(15, 7)

# perform inference
# regression predictions are clipped at zero (no upper bound)
preds_regression = np.clip(
    models_regression["final_model"][best_model]["model"].predict(X_test_catboost_reg),
    0,
    None,
)
# flatten so a column-shaped (n, 1) prediction array is also accepted
preds_classification = np.ravel(
    models_classification["final_model"][best_model]["model"].predict(
        X_test_catboost_clf
    )
)

# calculate performance metrics
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed
rmse = np.sqrt(mean_squared_error(y_test_catboost_reg, preds_regression))
mae = mean_absolute_error(y_test_catboost_reg, preds_regression)

f1_score_weighted = f1_score(
    y_test_catboost_clf, preds_classification, average="weighted"
)

# create a prediction dataframe: actual risk plus both models' predictions
# NOTE(review): the regression predictions are attached positionally to the
# classification test targets - this assumes both splits returned the same
# rows in the same order; confirm in train_test_validate_split
predictions_df = pd.DataFrame(data=y_test_catboost_clf.reset_index(drop=True))
predictions_df["pred_regression_los"] = preds_regression
# calculate equivalent risk score from regression model
predictions_df["pred_regression"] = [
    risk_score(los) for los in predictions_df.pred_regression_los
]
predictions_df["pred_classification"] = preds_classification

#### Predicted vs Actual ####

# builtin round() is used in the titles because it works for both numpy
# scalars and plain Python floats (the latter have no .round() method)

# plot predicted vs actual CLASSES for classification
_plot_actual_risk_by_prediction(
    axs[0], predictions_df, "pred_classification", risk_labels
)
axs[0].set_xlabel("Predicted risk")
axs[0].set_ylabel("Actual risk proportion")
axs[0].set_title(
    f"classification: {best_model} - f1 weighted: {round(f1_score_weighted, 2)}"
)

# plot predicted vs actual CLASSES for regression
_plot_actual_risk_by_prediction(
    axs[1], predictions_df, "pred_regression", risk_labels
)
# one legend for both plots, reversed so its order matches the stacking order
handles, labels = axs[1].get_legend_handles_labels()
axs[1].legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1))
axs[1].set_xlabel("Predicted risk")
axs[1].set_ylabel("Actual risk proportion")
axs[1].set_title(
    f"regression: {best_model} - RMSE {round(rmse, 2)} days, MAE {round(mae, 2)} days"
)

fig.suptitle("Predicted vs Actual risk");

For our best trained model, the regression approach better captures the overall risk of becoming a long stayer.

## Extensions

* Add number of predictions to bins in plots using e.g. https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
* Refactor visualisation code