# Baselines evaluation 

In [None]:
# Notebook parameters.
# config_name: YAML training config, joined onto "<root>/configs" when the
# config is loaded and also handed to `get_all_scores` when scores are computed.
config_name = "medmnist/pathmnist_resnet_dropout_all_layers.yml"
# scores_filename: CSV holding the scores dataframe; it is read with
# `pd.read_csv`, and `all_metrics.csv` is written into its parent directory.
# NOTE: pathlib treats an empty string as the current directory
# (Path("") == Path(".")), so with "" the metrics land in the working directory.
scores_filename = ""

In [None]:
import sys

root = "/data/failure_detection"
sys.path.append(root)
from configs.default_config import load_yaml_training_config
from failure_detection.evaluator import ThresholdBasedEvaluator
from failure_detection.run_evaluation import get_all_scores
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.calibration import CalibrationDisplay
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pandas as pd
from pathlib import Path

# Resolve the training config relative to the repository root.
root = "/data/failure_detection"
config = load_yaml_training_config(Path(root) / "configs" / config_name)

# Load the cached scores when a CSV is available, otherwise compute them.
# `is_file()` is used instead of `exists()` because Path("") is the current
# directory, which always exists: with an empty `scores_filename` the old check
# skipped score generation and then failed inside `pd.read_csv("")`.
if scores_filename and Path(scores_filename).is_file():
    scores_df = pd.read_csv(scores_filename)
else:
    print("Score dataframe not created yet, running scores script")
    scores_df = get_all_scores(config_name)

# With more than two classes the probability columns are blanked out.
# NOTE(review): presumably because a single probability column is only
# meaningful for binary tasks; confirm against ThresholdBasedEvaluator.
if config.n_classes > 2:
    for probas_column in ("mcmc_probas", "Probas", "Laplace_probas", "SWAG_probas"):
        scores_df[probas_column] = None

## A look at the model

In [None]:
# Confusion matrix of predictions against targets, normalised over the true
# labels and drawn on an 8x8 inch figure.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
true_labels = scores_df["Targets"].astype(int).values
predicted_labels = scores_df["Predictions"].astype(int).values
ConfusionMatrixDisplay.from_predictions(
    true_labels, predicted_labels, normalize="true", ax=ax
)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the stored predictions, printed with three decimals.
labels_true = scores_df["Targets"].astype(int).values
labels_pred = scores_df["Predictions"].astype(int).values
accuracy = accuracy_score(labels_true, labels_pred)
print(f"Acc: {accuracy:.3f}")

In [None]:
# Calibration curve with 10 bins; only drawn when the task is binary.
if config.n_classes == 2:
    calibration_targets = scores_df["Targets"].values
    calibration_probas = scores_df["Probas"].values
    CalibrationDisplay.from_predictions(
        calibration_targets, calibration_probas, n_bins=10
    )

In [None]:
# Stacked histogram of the "Probas" column, coloured by target; binary tasks only.
if config.n_classes == 2:
    histogram_axes = sns.histplot(
        x=scores_df["Probas"],
        hue=scores_df["Targets"],
        multiple="stack",
    )
    # histplot draws on the current axes, so titling the returned axes is the
    # same as calling plt.title right after it.
    histogram_axes.set_title("Distribution of p(y=1) by target (on test set)")

## Get Softmax baseline results

In [None]:
# Evaluate the "Baseline" score column; its metrics start the `all_metrics`
# table that the following cells extend.
baseline_inputs = [
    scores_df[column] for column in ("Baseline", "Predictions", "Targets", "Probas")
]
evaluator_baseline = ThresholdBasedEvaluator(*baseline_inputs, "Baseline")
all_metrics = evaluator_baseline.get_new_metrics()

In [None]:
# Display the metrics table as it stands after the baseline evaluator has run.
all_metrics

## Get DOCTOR results

In [None]:
# DOCTOR scores are optional: each variant is evaluated only when its column
# is present in the scores dataframe.
# `DataFrame.append` was removed in pandas 2.0, so rows are added with
# `pd.concat`; both operands come from `get_new_metrics()`, so they share a type.
if "doctor_alpha" in scores_df.columns:
    evaluator_alpha = ThresholdBasedEvaluator(
        scores_df["doctor_alpha"],
        scores_df["Predictions"],
        scores_df["Targets"],
        scores_df["Probas"],
        "DoctorAlpha",
    )
    all_metrics = pd.concat(
        [all_metrics, evaluator_alpha.get_new_metrics()], ignore_index=True
    )

if "doctor_alpha_pbb" in scores_df.columns:
    evaluator_alpha_pbb = ThresholdBasedEvaluator(
        scores_df["doctor_alpha_pbb"],
        scores_df["Predictions"],
        scores_df["Targets"],
        scores_df["Probas"],
        "DoctorAlphaPBB",
    )
    all_metrics = pd.concat(
        [all_metrics, evaluator_alpha_pbb.get_new_metrics()], ignore_index=True
    )

## Get TrustScore results

In [None]:
# TrustScore is optional: evaluated only when its column is present.
if "TrustScore" in scores_df.columns:
    evaluator_trustscore = ThresholdBasedEvaluator(
        scores_df["TrustScore"],
        scores_df["Predictions"],
        scores_df["Targets"],
        scores_df["Probas"],
        "Trust Score",
    )
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported way to add the new metrics rows.
    all_metrics = pd.concat(
        [all_metrics, evaluator_trustscore.get_new_metrics()], ignore_index=True
    )

## MC dropout (Monte Carlo dropout; `mcmc_*` columns)

In [None]:
# Dropout-based scores are optional: both the averaged-softmax and the entropy
# scores are evaluated against the aggregated `mcmc_predictions`.
# `DataFrame.append` was removed in pandas 2.0, so rows are added with `pd.concat`.
if "mcmc_soft_scores" in scores_df.columns:
    evaluator_mcmcmean_agg = ThresholdBasedEvaluator(
        scores_df["mcmc_soft_scores"],
        scores_df["mcmc_predictions"],
        scores_df["Targets"],
        scores_df["mcmc_probas"],
        "MCMC Average Softmax score - Agg pred",
    )
    all_metrics = pd.concat(
        [all_metrics, evaluator_mcmcmean_agg.get_new_metrics()], ignore_index=True
    )
    # NOTE: the same variable name is rebound to the entropy-based evaluator.
    evaluator_mcmcmean_agg = ThresholdBasedEvaluator(
        scores_df["mcmc_entropy_scores"],
        scores_df["mcmc_predictions"],
        scores_df["Targets"],
        scores_df["mcmc_probas"],
        "MCMC Entropy score - Agg pred",
    )
    all_metrics = pd.concat(
        [all_metrics, evaluator_mcmcmean_agg.get_new_metrics()], ignore_index=True
    )

# Jupyter only renders a trailing expression at the top level of a cell, so the
# table is shown here rather than inside the `if` body, where it was a no-op.
all_metrics

## Laplace

In [None]:
# Laplace results are optional and use their own prediction/target/probability
# columns rather than the shared "Predictions"/"Targets"/"Probas" ones.
if "Laplace_predictions" in scores_df.columns:
    evaluator_laplace = ThresholdBasedEvaluator(
        scores_df["Laplace_score"],
        scores_df["Laplace_predictions"],
        scores_df["Laplace_targets"],
        scores_df["Laplace_probas"],
        "Laplace",
    )
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
    all_metrics = pd.concat(
        [all_metrics, evaluator_laplace.get_new_metrics()], ignore_index=True
    )

## ConfidNet

In [None]:
# ConfidNet scores are optional: evaluated only when their column is present,
# against the shared predictions, targets and probabilities.
if "ConfidNet_scores" in scores_df.columns:
    evaluator_confidNet = ThresholdBasedEvaluator(
        scores_df["ConfidNet_scores"],
        scores_df["Predictions"],
        scores_df["Targets"],
        scores_df["Probas"],
        "ConfidNet",
    )
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
    all_metrics = pd.concat(
        [all_metrics, evaluator_confidNet.get_new_metrics()], ignore_index=True
    )

## SWAG

In [None]:
# SWAG results are optional and use their own prediction/target/probability
# columns rather than the shared "Predictions"/"Targets"/"Probas" ones.
if "SWAG_score" in scores_df.columns:
    evaluator_swag = ThresholdBasedEvaluator(
        scores_df["SWAG_score"],
        scores_df["SWAG_predictions"],
        scores_df["SWAG_targets"],
        scores_df["SWAG_probas"],
        "SWAG",
    )
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
    all_metrics = pd.concat(
        [all_metrics, evaluator_swag.get_new_metrics()], ignore_index=True
    )

## DUQ

In [None]:
# DUQ results are optional; they use DUQ-specific scores, predictions and
# probabilities together with the shared "Targets" column.
if "DUQ_score" in scores_df.columns:
    evaluator_duq = ThresholdBasedEvaluator(
        scores_df["DUQ_score"],
        scores_df["DUQ_predictions"],
        scores_df["Targets"],
        scores_df["DUQ_probas"],
        "DUQ",
    )
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
    all_metrics = pd.concat(
        [all_metrics, evaluator_duq.get_new_metrics()], ignore_index=True
    )

In [None]:
# Persist the combined metrics next to the scores file, then display them.
metrics_dir = Path(scores_filename).parent
# The scores file is allowed to be missing (scores are then recomputed), so its
# directory may not exist yet either; create it instead of failing in to_csv.
metrics_dir.mkdir(parents=True, exist_ok=True)
all_metrics.to_csv(metrics_dir / "all_metrics.csv", index=False)
all_metrics