# Model Report

In [None]:
import sys, os
import pandas as pd
import seaborn as sns

# Make modules under ./workflow importable from this notebook. The path is
# resolved against the current working directory.
workflow_dir = os.path.abspath("workflow")
sys.path.append(workflow_dir)

## Read in data

In [None]:
import pickle
from gzip import GzipFile

# Load every gzip-compressed, pickled model handed over by Snakemake and index
# it by the model's own `id_` attribute.
# NOTE(security): pickle.load can execute arbitrary code; only feed it files
# produced by this workflow, never untrusted input.
models = {}
for path in snakemake.input:
    with GzipFile(path, mode="rb") as handle:
        model = pickle.load(handle)
    # A later file with the same id_ replaces the earlier entry.
    models[model.id_] = model

## ROC

In [None]:
from sklearn.metrics import roc_curve, auc

# Wrangle: compute one ROC curve per (model, fold) and stack them in long format.
roc_frames = []
for model_id, m in models.items():  # `model_id`, not `id`: don't shadow the builtin
    for fold in m.res_:
        test = m.res_[fold]["test"]
        fpr, tpr, thresholds = roc_curve(
            test["label"],
            test["proba_KNRGL"],
            pos_label="KNRGL",
        )
        roc = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": thresholds})
        roc["fold"] = fold
        roc["model"] = model_id
        roc["auc"] = auc(fpr, tpr)  # scalar, repeated on every row of this fold
        roc_frames.append(roc)

# Average over folds.
# The columns to average are selected explicitly: the first positional
# parameter of GroupBy.mean() is `numeric_only`, so passing a list of column
# names there is only interpreted as a truthy flag (and the old list named
# "fpr" twice instead of "tpr").
# NOTE(review): rows are only pooled where folds share the exact same
# threshold value; confirm this is the intended way to average the curves.
plot_df = (
    pd.concat(roc_frames, ignore_index=True)
    .groupby(["model", "threshold"])[["fpr", "tpr", "auc"]]
    .mean()
    .reset_index()
)

# Plot
sns.set_style("ticks")
# sns.lineplot returns a matplotlib Axes (the historical name `fig` is kept).
fig = sns.lineplot(
    data=plot_df,
    x="fpr",
    y="tpr",
    hue="model",
)

fig.axline(
    xy1=(0, 0), slope=1, linestyle="--", color="gray", linewidth=1
)  # chance-level diagonal (y = x)
fig.set(
    xlim=(0, 1), ylim=(0, 1), xlabel="False Positive Rate", ylabel="True Positive Rate"
)  # set axis limits and labels
sns.despine()

## Theoretical PPV

In [None]:
from sklearn.metrics import confusion_matrix

# Hypothetical OTHER:KNRGL class ratios at which to evaluate the PPV.
bal = [1, 10, 100, 1000, 10000, 100000]

# Wrangle: one record per (model, fold, class ratio).
ppv_records = []
for model_id, m in models.items():  # `model_id`, not `id`: don't shadow the builtin
    for fold in m.res_:
        test = m.res_[fold]["test"]
        # Row-normalised confusion matrix (rows = true label, columns =
        # predicted label, both ordered KNRGL, OTHER). It does not depend on
        # the class ratio, so compute it once per fold instead of once per `b`.
        cm = confusion_matrix(
            test["label"],
            test["pred"],
            normalize="true",
            labels=["KNRGL", "OTHER"],
        )
        tpr = cm[0, 0]  # fraction of true KNRGL predicted as KNRGL
        fpr = cm[1, 0]  # fraction of true OTHER predicted as KNRGL
        for b in bal:
            n_knrgl, n_other = 1, b
            tp = n_knrgl * tpr
            fp = n_other * fpr
            ppv_records.append(
                {
                    "tpr": tpr,
                    "fpr": fpr,
                    "nKNRGL": n_knrgl,
                    "nOTHER": n_other,
                    "tp": tp,
                    "fp": fp,
                    "ppv": tp / (tp + fp),
                    "fold": fold,
                    "model": model_id,
                }
            )

# Average over folds. Columns are selected explicitly because the first
# positional parameter of GroupBy.mean() is `numeric_only`, not a column list.
plot_df = (
    pd.DataFrame.from_records(ppv_records)
    .groupby(["model", "nOTHER"])[["tp", "fp", "ppv", "tpr", "fpr"]]
    .mean()
    .reset_index()
)

# `markers=True` only takes effect together with a `style` variable; pass the
# matplotlib `marker` keyword so that every evaluated ratio is actually drawn.
fig = sns.lineplot(data=plot_df, x="nOTHER", y="ppv", hue="model", marker="o")
fig.set(xscale="log")
fig.set(
    ylabel="Positive Predictive Value (TP/(TP+FP))",
    xlabel="OTHER:KNRGL class ratio",
)
sns.despine()

## Feature Importances

In [None]:
from functools import reduce


def get_fi(estimator):
    """Return the feature importances of *estimator* as a two-column table.

    The result has one row per entry of ``estimator.feature_names_in_`` with
    the columns ``feature`` (the name) and ``importance`` (the matching entry
    of ``estimator.feature_importances_``).
    """
    return pd.DataFrame().assign(
        feature=estimator.feature_names_in_,
        importance=estimator.feature_importances_,
    )


# Collect, per model, a table of per-fold feature importances plus their mean.
fi_list = []
for model_id, m in models.items():  # `model_id`, not `id`: don't shadow the builtin
    # Final pipeline step of every fold's classifier, keyed by fold.
    estimators = {fold: m.res_[fold]["clf"]._final_estimator for fold in m.res_}
    # hasattr() (rather than `in dir(...)`) also rejects attributes that are
    # listed but raise AttributeError on access; checking every fold avoids
    # hard-coding the fold key 0.
    if not estimators or not all(
        hasattr(est, "feature_importances_") for est in estimators.values()
    ):
        print(f"{model_id} does not have feature importances")
        continue

    # One uniquely named column per fold. Chaining pd.merge over frames that
    # all carry an "importance" column yields clashing "_x"/"_y" suffixes once
    # there are four or more folds, which recent pandas rejects.
    per_fold = [
        get_fi(est).set_index("feature")["importance"].rename(f"importance_fold_{fold}")
        for fold, est in estimators.items()
    ]
    # join="inner" keeps only features present in every fold, like the
    # default inner merge did.
    df = pd.concat(per_fold, axis=1, join="inner").rename_axis("feature")
    df["mean_importance"] = df.mean(axis=1)
    df["model_id"] = model_id
    df = df.sort_values("mean_importance", ascending=False).reset_index()
    fi_list.append(df)

if fi_list:
    fi_df = pd.concat(fi_list, ignore_index=True)
    fig = sns.FacetGrid(
        fi_df, col="model_id", sharey=False, sharex=False, height=12, aspect=0.6
    )
    # NOTE(review): `join` is reported as deprecated by newer seaborn releases
    # in favour of linestyle="none"; kept for compatibility with the
    # installed version - confirm before upgrading.
    fig.map_dataframe(
        sns.pointplot,
        x="mean_importance",
        y="feature",
        join=False,
    )
    sns.despine()
else:
    # pd.concat([]) would raise; there is simply nothing to plot.
    print("No model exposes feature importances; skipping plot")