## 6) Bounded DTW & Calibration

- **Raw** DTW distances normalized by various methods → compute AUC & EER  
- **Calibrate** the best normalization (`d_by_avg_len`) via logistic regression  
- Report both raw and calibrated AUC/EER, save to disk

In [None]:

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.metrics import roc_curve, roc_auc_score

# Make sure `src/` is importable.
# NOTE(review): this takes the parent of the current working directory as the
# project root, so it presumably expects the notebook to be run from a folder
# one level below the repository root — confirm before running it elsewhere.
project_root = Path.cwd().parent
# Inserted at position 0 so this directory is searched before any other entry
# on sys.path when the `src.*` imports below are resolved.
sys.path.insert(0, str(project_root))

# helpers
from src.evaluation.evaluation import load_results, compute_metrics, plot_roc, plot_det
from src.calibration.calibration import add_normalizations, train_calibrator


# Input artefacts and output folders, all resolved relative to the project root.
data_dir = project_root / "data"
PAIRS_PATH = data_dir / "pairs_meta.parquet"
CACHE_PATH = data_dir / "dtw_cache.parquet"
FIG_DIR = project_root / "figures"
RESULTS_DIR = project_root / "results"

# Create both output folders up front; already-existing folders are fine.
for out_dir in (FIG_DIR, RESULTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# 6.1) Load & merge the pair metadata with the DTW cache
df = load_results(PAIRS_PATH, CACHE_PATH)

# 6.2) Add the simple normalizations that are evaluated in the next step
df_norm = add_normalizations(df)

# 6.3) Evaluate each normalization's AUC & EER
#
# The normalized columns are DTW *distances*: a smaller value means a closer
# match. The recorded run of this cell scored 'd_by_avg_len' at AUC 0.31 /
# EER 0.65 (worse than chance) while the monotone logistic calibrator reached
# AUC 0.69 on the same feature, which is the signature of an inverted score
# polarity rather than of a real calibration gain. The distances are therefore
# negated before scoring so that "higher = more likely a match".
# NOTE(review): this assumes compute_metrics treats larger scores as more
# target-like and that df_norm is a pandas DataFrame — confirm against
# src/evaluation/evaluation.py.
methods = ['d_by_path', 'd_by_ref_len', 'd_by_qry_len', 'd_by_avg_len']

# Work on a copy so df_norm keeps the untouched distances for the calibrator.
df_scores = df_norm.copy()
df_scores[methods] = -df_scores[methods]

# Keep the full result of every method so the best one is not recomputed below.
raw_metrics = {m: compute_metrics(df_scores, score_col=m) for m in methods}
norm_results = {
    m: {'auc': res['auc'], 'eer': res['eer']}
    for m, res in raw_metrics.items()
}

# display table of raw performance (one row per normalization)
display(pd.DataFrame(norm_results).T)

# 6.4) Plot ROC for the best normalization (here 'd_by_avg_len')
best = 'd_by_avg_len'
res_best = raw_metrics[best]  # reuse the metrics computed in 6.3

fig, ax = plt.subplots(figsize=(6, 6))
plot_roc(res_best['fpr'], res_best['tpr'], res_best['auc'], ax=ax)
ax.set_title(f"Raw ROC ({best})")
fig.savefig(FIG_DIR / f"roc_{best}.png", dpi=300, bbox_inches="tight")
plt.close(fig)  # release the figure once it is on disk

# 6.5) Fit a logistic calibrator on the chosen feature; train_calibrator hands
# back the model together with the test inputs, labels and calibrated scores.
model, X_test, y_test, y_score_test = train_calibrator(
    df_norm,
    feature_col=best,
    method='logistic',
)

# ROC curve and AUC of the calibrated scores on the returned test split
fpr_c, tpr_c, thr_c = roc_curve(y_test, y_score_test)
auc_c = roc_auc_score(y_test, y_score_test)

# Draw the calibrated ROC against the chance diagonal and write it to FIG_DIR.
calibrated_png = FIG_DIR / f"roc_calibrated_{best}.png"
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr_c, tpr_c, lw=2, label=f"AUC={auc_c:.3f}")
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Calibrated ROC ({best})",
)
ax.legend(loc="lower right")
fig.savefig(calibrated_png, dpi=300, bbox_inches="tight")
plt.close(fig)

# 6.6) Compute calibrated EER, report the summary and save it to disk
#
# The EER is read off at the ROC point where the false-positive and
# false-negative rates are closest, taking the mean of the two rates there.
fnr_c = 1 - tpr_c
idx_c = int(np.argmin(np.abs(fpr_c - fnr_c)))
eer_c = float((fpr_c[idx_c] + fnr_c[idx_c]) / 2)
thr_eer_c = float(thr_c[idx_c])

# Coerce to built-in floats so the values serialize to JSON whatever numeric
# type compute_metrics returned.
raw_auc = float(norm_results[best]['auc'])
raw_eer = float(norm_results[best]['eer'])

print(f"Raw     {best} → AUC {raw_auc:.4f}, EER {raw_eer:.4f}")
print(f"Calibrated      → AUC {auc_c:.4f}, EER {eer_c:.4f} @ threshold {thr_eer_c:.4f}")

# Persist the summary: RESULTS_DIR is created at the top of this cell, but the
# numbers were previously only printed and never written out.
summary = {
    'feature': best,
    'raw': {'auc': raw_auc, 'eer': raw_eer},
    'calibrated': {
        'auc': float(auc_c),
        'eer': eer_c,
        'eer_threshold': thr_eer_c,
    },
}
summary_path = RESULTS_DIR / f"calibration_summary_{best}.json"
with open(summary_path, "w", encoding="utf-8") as fh:
    json.dump(summary, fh, indent=2)


Raw     d_by_avg_len AUC = 0.3104, EER = 0.6476
Calibrated      AUC = 0.6922
