In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm import tqdm_notebook as tqdm

import altair as alt
# Select Altair's 'html' renderer for displaying the charts built in this notebook.
alt.renderers.enable('html')

RendererRegistry.enable('html')

In [None]:
# --- Notebook configuration -------------------------------------------------
# CSV holding the ground-truth labels (read with `pd.read_csv`).
TRUTH_LABELS = "~/test_datasets/dr_binary.csv"
# CSV holding the model predictions (read with `pd.read_csv`).
PRED_LABELS = "~/test_datasets/dr_binary_predictions.csv"
# Label value treated as the positive class of the binary problem.
BINARY_LABEL = "y"
# Number of bootstrap resamples drawn when estimating the metric spread.
BOOTSTRAP_ITERATIONS = 100

Read truth and predicted labels. We are only interested in the selected binary label and its score in the predicted dataframe.

In [None]:
# Ground truth, indexed by image so it can be aligned with the predictions.
# (The file is read exactly once; a second, discarded read was removed.)
df_truth = pd.read_csv(TRUTH_LABELS).set_index("image_id")

# Predictions: keep only the rows that carry the score of the selected label.
df_pred = pd.read_csv(PRED_LABELS)
df_pred = df_pred[df_pred.label == BINARY_LABEL].set_index("image_id")

# Align the three series on image_id. Images missing any of the values are
# dropped so that every remaining row has a truth flag, a grade and a score.
df_merged = pd.concat([
    df_truth.label == BINARY_LABEL,  # boolean: is this image a positive?
    df_truth.dr_grade,
    df_pred.score
], axis=1, sort=True).dropna()
df_merged = df_merged.reset_index()

df_merged.head()

In [None]:
def calculate_scores_df(df_bootstrapped, num_steps=100):
    """Compute classification metrics over a sweep of score thresholds.

    Parameters
    ----------
    df_bootstrapped : pandas.DataFrame
        Frame with a ``label`` column (cast to boolean here) and a ``score``
        column. A row is predicted positive when ``score >= threshold``.
    num_steps : int, default 100
        Number of evenly spaced thresholds taken from the closed range [0, 1].

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``score_threshold``,
        ``accuracy``, ``sensitivity``, ``specificity``, ``f1``, ``tpr``,
        ``fpr`` and ``auc``. ``auc`` is computed once from the raw scores and
        therefore holds the same value in every row.
    """
    # Builtin `bool` replaces the `np.bool` alias, which no longer exists in
    # current NumPy releases.
    truth = df_bootstrapped.label.astype(bool)
    not_truth = ~truth  # loop-invariant, used for specificity
    scores_list = []
    for score_threshold in np.linspace(0, 1, num_steps):
        predicted = df_bootstrapped.score >= score_threshold
        scores = {
            "score_threshold": score_threshold,
            "accuracy": metrics.accuracy_score(truth, predicted),
            "sensitivity": metrics.recall_score(truth, predicted),
            # Specificity is the recall of the negative class.
            "specificity": metrics.recall_score(not_truth, ~predicted),
            "f1": metrics.f1_score(truth, predicted),
        }
        # Calculate TPR and FPR to plot the ROC later on
        scores["tpr"] = scores["sensitivity"]
        scores["fpr"] = 1 - scores["specificity"]
        scores_list.append(scores)
    scores_df = pd.DataFrame(scores_list)
    scores_df["auc"] = metrics.roc_auc_score(truth, df_bootstrapped.score)
    return scores_df

Bootstrap the dataframe `BOOTSTRAP_ITERATIONS` times (resampling rows with replacement) and calculate the metrics on each iteration.

In [None]:
# Number of thresholds evaluated per run; the full-sample run and every
# bootstrap run must use the same grid so their rows can be subtracted later.
NUM_THRESHOLD_STEPS = 50
# Fixed seed so the resamples (and everything derived from them) are identical
# on every Restart & Run All.
BOOTSTRAP_SEED = 0
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

# Metrics on the original, un-resampled data: the point estimates.
df_scores_sample = calculate_scores_df(df_merged, num_steps=NUM_THRESHOLD_STEPS)
df_scores_sample = df_scores_sample.set_index("score_threshold")

scores_bootstrapped_list = []
for it in tqdm(range(BOOTSTRAP_ITERATIONS), desc="Bootstrapping..."):
    # Same size as the original data, drawn with replacement.
    df_bootstrapped = df_merged.sample(frac=1.0, replace=True, random_state=bootstrap_rng)
    scores_df = calculate_scores_df(df_bootstrapped, num_steps=NUM_THRESHOLD_STEPS)
    scores_df["bootstrap_iteration"] = it
    scores_bootstrapped_list.append(scores_df)
df_scores_bootstrapped = pd.concat(scores_bootstrapped_list).sort_values("score_threshold").reset_index(drop=True)
df_scores_bootstrapped = df_scores_bootstrapped.set_index(["score_threshold", "bootstrap_iteration"])

df_scores_bootstrapped.head()

Now calculate 95% confidence intervals by performing an empirical bootstrap

In [None]:
INTERVAL = 0.95
# A two-sided interval leaves (1 - INTERVAL) / 2 of the probability mass in
# each tail, i.e. the 2.5th and 97.5th percentiles for a 95% interval.
tail = (1 - INTERVAL) / 2

# Deviation of every bootstrap estimate from the full-sample estimate.
df_scores_mean_diff = df_scores_bootstrapped - df_scores_sample
scores_bootstrapped_list = []
# Iterate over the thresholds that were actually evaluated instead of
# rebuilding the grid from a hard-coded step count.
for score_threshold in df_scores_sample.index:
    df_view = df_scores_mean_diff.loc[score_threshold]
    for score_name in df_view.columns:
        diff_low, diff_high = df_view[score_name].quantile([tail, 1 - tail])
        # "mean" is the estimate computed on the full, un-resampled data.
        mean = df_scores_sample.loc[score_threshold, score_name]
        # Empirical bootstrap: the bounds are the estimate minus the deviation
        # quantiles, so c0 (minus the low quantile) is the upper bound and
        # c1 (minus the high quantile) is the lower bound.
        c0 = mean - diff_low
        c1 = mean - diff_high
        scores = {
            "score_threshold": score_threshold,
            "score_type": score_name,
            "mean": mean,
            "c0": c0,
            "c1": c1
        }
        scores_bootstrapped_list.append(scores)
df_scores_ci = pd.DataFrame(scores_bootstrapped_list)
df_scores_ci.head()

In [None]:
# One wide frame per statistic: rows are thresholds, columns are score types.
# `pivot` arguments are passed by keyword because recent pandas releases no
# longer accept them positionally.
df_mean = df_scores_ci.pivot(index="score_threshold", columns="score_type", values="mean")
df_c0 = df_scores_ci.pivot(index="score_threshold", columns="score_type", values="c0")
df_c1 = df_scores_ci.pivot(index="score_threshold", columns="score_type", values="c1")

In [None]:
# Show only the rows that describe the AUC score.
df_scores_ci.loc[df_scores_ci["score_type"] == "auc"]

In [None]:
# Wide frame with two column levels: statistic ("mean", "c0", "c1") on the
# outside, score type on the inside. Keyword arguments are required by recent
# pandas releases.
a = df_scores_ci.pivot(index="score_threshold", columns="score_type", values=["mean", "c0", "c1"])

In [None]:
# Data for the confidence band: the estimated FPR on the x axis and the two
# TPR confidence bounds delimiting the band.
b = pd.DataFrame({
    "fpr": a["mean"]["fpr"],
    "tpr_c0": a["c0"]["tpr"],
    "tpr_c1": a["c1"]["tpr"],
})

# Both frames get the same model tag, which the charts use for colouring.
for tagged_frame in (b, df_mean):
    tagged_frame["model"] = "model1"

In [None]:
# Read the AUC from the first row of the pivoted estimates for the title.
auc_score = df_mean["auc"].iloc[0]
output_chart = alt.Chart(df_mean, height=500, width=800).properties(title="ROC curve (AUC %.03f)" % auc_score)

# Shaded area between the two TPR confidence bounds.
band = alt.Chart(b).mark_area(opacity=0.3).encode(x='fpr', y='tpr_c0', y2='tpr_c1', color='model')

# The estimated ROC curve itself.
roc_curve = output_chart.mark_line().encode(x='fpr', y='tpr', color='model')

# One marker per threshold; hovering shows the metrics at that threshold.
threshold_markers = output_chart.mark_circle().encode(
    x='fpr',
    y='tpr',
    tooltip=['accuracy', 'f1', 'sensitivity', 'specificity', 'tpr', 'fpr'],
    color='model',
)

layered_chart = band + roc_curve + threshold_markers
layered_chart.interactive()

In [None]:
# `idxmax` returns the index *label* (the score threshold) of the best row.
# `argmax` would return its integer position, which is neither a valid `.loc`
# key here nor a meaningful score threshold.
max_accuracy_threshold = df_mean.accuracy.idxmax()
df_mean.loc[max_accuracy_threshold]

In [None]:
# Row counts per combination of thresholded prediction (rows, "score") and
# truth label (columns), using the threshold with the highest accuracy.
(
    df_merged
    .assign(score=df_merged.score >= max_accuracy_threshold)
    [["label", "score"]]
    .reset_index()
    .pivot_table(index="score", columns="label", aggfunc="count", fill_value=0)
)

In [None]:
# Row counts per combination of thresholded prediction (rows, "score") and
# DR grade (columns), using the threshold with the highest accuracy.
(
    df_merged
    .assign(score=df_merged.score >= max_accuracy_threshold)
    [["dr_grade", "score"]]
    .reset_index()
    .pivot_table(index="score", columns="dr_grade", aggfunc="count", fill_value=0)
)