# Model comparison

We have now trained a Regression model (length of stay) and a Classification model (risk 1-5).

We can derive an equivalent risk score from the regression model by mapping its predicted Length of Stay onto the risk categories:

Risk Category|Day Range for Risk Category
-----|------
1 - Very low risk|0-6
2 - Low risk|7-10
3 - Normal risk|11-13
4 - Elevated risk|14-15
5 - High risk|>15

For each modelling approach below, we compare side-by-side risk stratification plots of the classification version and the regression version:

Model|Regression version|Classification version
---|---|---
Dummy|Mean|Prior
Simple|ElasticNet|LogReg
Decision Tree|DecisionTreeRegressor|DecisionTreeClassifier
Random Forest|RandomForestRegressor|RandomForestClassifier
Catboost|CatBoostRegressor|CatBoostClassifier
XGBoost|XGBRegressor|XGBClassifier


In [None]:
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Remove the pandas display limits so DataFrames are shown with all rows and
# all columns (None means "no limit").
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline
# Default figure size (width, height) for figures that do not set their own.
plt.rcParams["figure.figsize"] = [15, 8]

In [None]:
# Helper functions


def risk_score(los):
    """Map a length of stay onto the 1-5 risk scale.

    The stay is rounded up to whole days and then compared against the
    category boundaries: 0-6 days -> 1, 7-10 -> 2, 11-13 -> 3, 14-15 -> 4,
    more than 15 -> 5.

    Parameters:
        los (float): length of stay in days

    Returns:
        (int): risk score (1 = Very low risk, 5 = High risk)
    """

    # a partial day counts as a full day
    whole_days = math.ceil(los)

    # (days that must be exceeded, resulting score), highest risk first so
    # the first boundary exceeded decides the score
    boundaries = ((15, 5), (13, 4), (10, 3), (6, 2))
    for exceeded_days, score in boundaries:
        if whole_days > exceeded_days:
            return score

    # nothing exceeded: six days or fewer
    return 1

## Load data

In [None]:
# Load the two feature tables: the general one and the variant used for the
# catboost models.
features_df = pd.read_parquet("../../data/features.parquet")
features_catboost_df = pd.read_parquet("../../data/features-catboost.parquet")
# add actual risk scores
# risk_labels are the display names of the five risk scores; the leading digit
# of each label is the numeric score (the plotting cell reads it via label[0]).
risk_labels = [
    "1 - Very Low Risk",
    "2 - Low Risk",
    "3 - Normal Risk",
    "4 - Elevated Risk",
    "5 - High Risk",
]
# The actual risk score is derived from the recorded LENGTH_OF_STAY.
features_df["risk"] = [risk_score(los) for los in features_df.LENGTH_OF_STAY]
features_catboost_df["risk"] = [
    risk_score(los) for los in features_catboost_df.LENGTH_OF_STAY
]
# separate training and target features
# NOTE: only LENGTH_OF_STAY is dropped, so X still contains the "risk" column;
# it has to be dropped again before X is passed to a model.
X = features_df.drop(columns=["LENGTH_OF_STAY"])
y = features_df.risk

# non-one-hot encoded data for catboost
# (same caveat: X_catboost still contains the "risk" column)
X_catboost = features_catboost_df.drop(columns=["LENGTH_OF_STAY"])
y_catboost = features_catboost_df.risk

# separate training and test data
# split data for train/test
# 75% train / 25% test with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42
)
print(X_train.shape, X_test.shape)

# Scale data for LogReg only using training data
# The scaler is fitted on X_train only and then applied to both splits; the
# results are wrapped back into DataFrames to keep the index and column names.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)
print(X_train_scaled.shape, X_test_scaled.shape)

# Split data for train/test
# Same train_size and random_state as the split above.
# NOTE(review): the two splits only select the same rows if both parquet files
# hold the same rows in the same order - confirm.
X_train_catboost, X_test_catboost, y_train_catboost, y_test_catboost = train_test_split(
    X_catboost, y_catboost, train_size=0.75, random_state=42
)
print(X_train_catboost.shape, X_test_catboost.shape)

## Load models

In [None]:
# Load the regression models from a pickle file stored outside the git tree.
# The loaded object is indexed later as models_regression[name]["model"].
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
with open("../../models/regression.pickle", "rb") as handle:
    models_regression = pickle.load(handle)
# bare expression: shows the loaded object as the cell output
models_regression

In [None]:
# Load the classification models from a pickle file stored outside the git tree.
# The loaded object is iterated by name later and indexed as
# models_classification[name]["model"].
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
with open("../../models/classification.pickle", "rb") as handle:
    models_classification = pickle.load(handle)
# bare expression: shows the loaded object as the cell output
models_classification

## Compare predicted risk score (classification) with equivalent-predicted risk score (regression)

Classification -> Risk score

Regression -> Length Of Stay -> Equivalent risk score

In [None]:
# Side-by-side risk stratification plots: one row per classification model,
# with the classifier on the left and its regression counterpart (predicted
# length of stay converted to an equivalent risk score) on the right.

# Classification model name -> name of the regression model it is compared
# with. Names not listed here are used unchanged for both model families.
_REGRESSION_EQUIVALENT = {
    "logreg": "elastic",  # logreg is being compared to elastic net regression
    "prior": "mean",  # prior is being compared to mean
}


def _risk_proportions(actual, predicted, scores):
    """Return the share of each predicted score within each actual score.

    Parameters:
        actual (array-like): actual risk score per row
        predicted (array-like): predicted risk score per row
        scores (list of int): the risk scores to tabulate, in display order

    Returns:
        (numpy.ndarray): square array where element [p, a] is the fraction of
            rows with actual score scores[a] that were predicted as scores[p].
            Columns for actual scores with no rows are left at zero.
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    proportions = np.zeros((len(scores), len(scores)))

    for col, actual_score in enumerate(scores):
        predicted_for_actual = predicted[actual == actual_score]
        if predicted_for_actual.size == 0:
            # no rows with this actual risk: avoid dividing by zero and leave
            # the bar empty
            continue
        for row, predicted_score in enumerate(scores):
            proportions[row, col] = (
                predicted_for_actual == predicted_score
            ).sum() / predicted_for_actual.size

    return proportions


def _plot_risk_stratification(ax, actual, predicted, labels, title, show_legend=False):
    """Draw one stacked bar per actual risk, split by predicted risk.

    Parameters:
        ax (matplotlib axes): axes to draw on
        actual (array-like): actual risk score per row
        predicted (array-like): predicted risk score per row
        labels (list of str): risk labels; the first character of each label
            is its numeric score
        title (str): title for the axes
        show_legend (bool): draw the legend to the right of the axes
    """
    scores = [int(label[0]) for label in labels]
    proportions = _risk_proportions(actual, predicted, scores)

    # stack the segments: each one starts where the previous ones ended
    bottom = np.zeros(len(labels))
    for label, heights in zip(labels, proportions):
        ax.bar(labels, heights, label=label, bottom=bottom, width=0.35)
        bottom = bottom + heights

    if show_legend:
        # reverse so the legend order matches the visual stacking order
        handles, legend_labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], legend_labels[::-1], bbox_to_anchor=(1.05, 1))

    ax.set_xlabel("Actual risk")
    ax.set_ylabel("Predicted risk")
    ax.set_title(title)


# setup a subplot figure
# squeeze=False keeps axs two-dimensional even when there is a single model
fig, axs = plt.subplots(len(models_classification), 2, squeeze=False)
fig.set_size_inches(15, 7 * len(models_classification))

for row, model_classification in enumerate(models_classification):
    # pick the test features (and matching labels) each classifier expects;
    # the "risk" column is still part of the X frames and must be removed
    if model_classification == "catboost":
        model_classification_X_test = X_test_catboost.drop(columns="risk")
        # labels from the same frame the catboost features come from
        model_classification_y_test = y_test_catboost
    elif model_classification == "logreg":
        model_classification_X_test = X_test_scaled.drop(columns="risk")
        model_classification_y_test = y_test
    else:
        model_classification_X_test = X_test.drop(columns="risk")
        model_classification_y_test = y_test

    # logreg is being compared to elastic net regression, which is given the
    # unscaled features
    # NOTE(review): presumably elastic net was trained on unscaled data - confirm
    if model_classification == "logreg":
        model_regression_X_test = X_test.drop(columns="risk")
    else:
        model_regression_X_test = model_classification_X_test

    model_regression = _REGRESSION_EQUIVALENT.get(
        model_classification, model_classification
    )

    # perform inference
    preds_regression = models_regression[model_regression]["model"].predict(
        model_regression_X_test
    )
    preds_classification = models_classification[model_classification]["model"].predict(
        model_classification_X_test
    )

    # create a prediction dataframe; the index is reset so the predictions
    # (plain arrays) line up with the actual risk by position
    predictions_df = pd.DataFrame(
        data=model_classification_y_test.reset_index(drop=True)
    )
    predictions_df["pred_regression_los"] = preds_regression
    # calculate equivalent risk score from regression model
    predictions_df["pred_regression"] = [
        risk_score(los) for los in predictions_df.pred_regression_los
    ]
    predictions_df["pred_classification"] = preds_classification

    # plot actual vs predicted CLASSES for classification
    # (legend skipped as it is the same as on the right-hand side)
    _plot_risk_stratification(
        axs[row, 0],
        predictions_df.risk,
        predictions_df.pred_classification,
        risk_labels,
        f"classification: {model_classification}",
    )

    # plot actual vs predicted CLASSES for regression
    _plot_risk_stratification(
        axs[row, 1],
        predictions_df.risk,
        predictions_df.pred_regression,
        risk_labels,
        f"regression: {model_regression}",
        show_legend=True,
    )

## Extensions

* Refactor visualisation code