# In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss,
    roc_curve, precision_recall_curve
)

# Prediction CSVs for model A and model B.
A_PATH, B_PATH = Path("modelA_preds.csv"), Path("modelB_preds.csv")
# Probability cut-off used for the threshold-based metrics.
THRESH = 0.5

import numpy as np

# Margin used when clipping values away from the interval end points
# before a logit transform is applied.
_EPS = 1e-9

def _expit(z):
    return 1.0 / (1.0 + np.exp(-z))

def _logit(p):
    """Return the log-odds of *p*.

    Values are clipped to ``[_EPS, 1 - _EPS]`` first, so exact 0 and 1 do
    not produce infinities.
    """
    probs = np.clip(np.asarray(p, float), _EPS, 1.0 - _EPS)
    odds = probs / (1.0 - probs)
    return np.log(odds)

def ci_logit_percentile_01(x, lo=2.5, hi=97.5):
    """Percentile interval for values in (0, 1), computed on the logit scale.

    The values are clipped, logit-transformed, the ``lo``/``hi`` percentiles
    are taken ignoring NaNs, and both bounds are mapped back with the
    sigmoid. Returns ``(lower, upper)`` as Python floats.
    """
    clipped = np.clip(np.asarray(x, float), _EPS, 1.0 - _EPS)
    lower_z, upper_z = np.nanpercentile(_logit(clipped), [lo, hi])
    lower = float(_expit(lower_z))
    upper = float(_expit(upper_z))
    return lower, upper

def ci_logit_percentile_diff01(x, lo=2.5, hi=97.5):
    """Logit-percentile interval for values in (-1, 1).

    The values are clipped just inside (-1, 1), rescaled linearly onto
    (0, 1), passed to ``ci_logit_percentile_01``, and the resulting bounds
    are rescaled back. Returns ``(lower, upper)``.
    """
    diffs = np.clip(np.asarray(x, float), -1.0 + _EPS, 1.0 - _EPS)
    unit_scale = (diffs + 1.0) / 2.0
    lower, upper = ci_logit_percentile_01(unit_scale, lo=lo, hi=hi)
    return 2.0 * lower - 1.0, 2.0 * upper - 1.0

def _pick_col(df, candidates, required=False, name=""):
    for c in candidates:
        if c in df.columns:
            return c
    if required:
        raise ValueError(f"Missing required column for {name}. Tried {candidates}")
    return None

def load_preds(path: Path, tag: str) -> pd.DataFrame:
    """Read one model's prediction CSV and return it with tagged columns.

    The ID column (``SEQN`` or ``seqn``) is returned as ``SEQN``; the
    outcome, probability and optional weight columns are renamed to
    ``y_<tag>``, ``p_<tag>`` and ``w_<tag>``. ``SDMVPSU`` / ``SDMVSTRA``
    are carried along as ``SDMVPSU_<tag>`` / ``SDMVSTRA_<tag>`` when present.

    Raises:
        ValueError: if the ID, outcome or probability column is missing.
    """
    # Force string dtype for the ID under BOTH accepted spellings; otherwise
    # a file using "seqn" is parsed numerically and its key no longer has
    # the same dtype as a file using "SEQN" when the two are merged.
    id_dtypes = {
        "SEQN": "string",
        "seqn": "string",
        "SDMVPSU": "string",
        "SDMVSTRA": "string",
    }
    df = pd.read_csv(path, dtype=id_dtypes)
    df.columns = df.columns.str.strip()

    c_id = _pick_col(df, ["SEQN", "seqn"], True, "SEQN")
    c_y = _pick_col(df, ["y", "y2", "Y"], True, "outcome")
    c_p = _pick_col(df, ["p", "p_cal", "p2", "p_uncal"], True, "probability")
    c_w = _pick_col(df, ["w", "w2"], False, "weight")

    # Map source column -> output column; insertion order is the output order.
    keep = {c_id: "SEQN", c_y: f"y_{tag}", c_p: f"p_{tag}"}
    if c_w:
        keep[c_w] = f"w_{tag}"
    for design_col in ("SDMVPSU", "SDMVSTRA"):
        if design_col in df.columns:
            keep[design_col] = f"{design_col}_{tag}"

    return df[list(keep)].rename(columns=keep).reset_index(drop=True)

# Load both prediction sets.
A, B = load_preds(A_PATH, "A"), load_preds(B_PATH, "B")

# Pair the rows on SEQN. The inner join keeps only IDs found in both files,
# and validate= makes pandas raise if an ID is repeated on either side.
M = pd.merge(
    A,
    B,
    on="SEQN",
    how="inner",
    validate="one_to_one",
    suffixes=("_A", "_B"),
)

def coalesce_design(df: pd.DataFrame, base: str) -> pd.DataFrame:
    """Add a single ``base`` column built from ``base_A`` / ``base_B``.

    *df* is modified in place and also returned. Nothing happens when
    ``base`` already exists or neither tagged column is present. When both
    are present the ``_A`` column is used, and a warning is printed if the
    two disagree (compared as strings, with missing values treated as "").
    """
    col_a, col_b = f"{base}_A", f"{base}_B"
    available = [col for col in (col_a, col_b) if col in df.columns]
    if base in df.columns or not available:
        return df

    if len(available) == 2:
        left = df[col_a].fillna("").astype(str)
        right = df[col_b].fillna("").astype(str)
        if not bool((left == right).all()):
            print(f"WARNING: {base}_A and {base}_B differ; using {col_a}.")

    # available[0] is the _A column whenever it exists, otherwise _B.
    df[base] = df[available[0]]
    return df

# Collapse the per-file design columns into single SDMVPSU / SDMVSTRA columns.
M = coalesce_design(coalesce_design(M, "SDMVPSU"), "SDMVSTRA")

# Outcome vector, taken from file A and cast to int.
y = M["y_A"].to_numpy(int)
# Both files must carry identical outcome values for the merged rows;
# otherwise the comparison below would not be on the same labels.
if not np.array_equal(M["y_A"].to_numpy(), M["y_B"].to_numpy()):
    raise RuntimeError("Outcome mismatch between files—ensure same test cohort & labels.")

# Predicted probabilities of each model, aligned row-for-row with y.
pA = M["p_A"].to_numpy(float)
pB = M["p_B"].to_numpy(float)

# Choose the weight vector: prefer file A's weights, then file B's, and fall
# back to unit weights. Missing weights are treated as 0.
if "w_A" in M.columns and "w_B" in M.columns:
    # Both files carry weights. File A's are used either way, but a
    # disagreement is reported instead of being silently ignored.
    if not np.allclose(M["w_A"], M["w_B"], equal_nan=True):
        print("WARNING: w_A and w_B differ; using w_A.")
    w = M["w_A"].fillna(0.0).to_numpy(float)
elif "w_A" in M.columns:
    w = M["w_A"].fillna(0.0).to_numpy(float)
elif "w_B" in M.columns:
    w = M["w_B"].fillna(0.0).to_numpy(float)
else:
    w = np.ones(len(M), dtype=float)

# Design identifiers as string arrays; None when the column is absent.
psu = stra = None
if "SDMVPSU" in M.columns:
    psu = M["SDMVPSU"].astype(str).to_numpy()
if "SDMVSTRA" in M.columns:
    stra = M["SDMVSTRA"].astype(str).to_numpy()


def metrics_w(y, p, w):
    """Return ``(ROC AUC, average precision, Brier score)`` for *p* against *y*.

    Each metric is computed with *w* passed as ``sample_weight``.
    """
    scorers = (roc_auc_score, average_precision_score, brier_score_loss)
    return tuple(score(y, p, sample_weight=w) for score in scorers)

def confusion_at(y, p, w, thr):
    """Weighted sensitivity, specificity, PPV and NPV at threshold *thr*.

    A row is predicted positive when ``p >= thr``; actual positives are rows
    with ``y == 1``. Each cell of the 2x2 table is the sum of *w* over its
    rows. A ratio whose denominator is not positive is returned as NaN.
    Returns ``(sens, spec, ppv, npv)``.
    """
    flagged = p >= thr
    is_pos = y == 1
    is_neg = ~is_pos

    def mass(mask):
        return float(np.sum(w[mask]))

    def ratio(num, den):
        return num / den if den > 0 else np.nan

    tp = mass(flagged & is_pos)
    fn = mass(~flagged & is_pos)
    tn = mass(~flagged & is_neg)
    fp = mass(flagged & is_neg)

    return (
        ratio(tp, tp + fn),
        ratio(tn, tn + fp),
        ratio(tp, tp + fp),
        ratio(tn, tn + fn),
    )

# Weighted point estimates for each model on the merged data.
(aucA, apA, bA), (aucB, apB, bB) = metrics_w(y, pA, w), metrics_w(y, pB, w)

print("=== Point estimates (weighted) ===")
print(
    f"Model A: AUC={aucA:.3f} | PR-AUC={apA:.3f} | Brier={bA:.3f}"
)
print(
    f"Model B: AUC={aucB:.3f} | PR-AUC={apB:.3f} | Brier={bB:.3f}"
)


def pr_baseline(y, w):
    """Return the weighted share of rows with ``y == 1``.

    Returns 0.0 when the total weight is not positive.
    """
    total = np.sum(w)
    if not total > 0:
        return 0.0
    positive_mass = np.sum(w[y == 1])
    return float(positive_mass / total)


# Weighted ROC curves of both models on one figure, saved to compare_roc.png.
plt.figure(figsize=(7, 6))
for p, name in ((pA, "A"), (pB, "B")):
    fpr, tpr, _ = roc_curve(y, p, sample_weight=w)
    auc = roc_auc_score(y, p, sample_weight=w)
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
# Diagonal reference line.
plt.plot([0, 1], [0, 1], "--", alpha=0.6)
plt.xlabel("1 - Specificity")
plt.ylabel("Sensitivity")
plt.title("TEST ROC (weighted)")
plt.legend()
plt.tight_layout()
plt.savefig("compare_roc.png", dpi=150)


# Weighted precision-recall curves of both models, saved to compare_pr.png.
# The horizontal reference line is the weighted share of positives.
prev = pr_baseline(y, w)
plt.figure(figsize=(7, 6))
for p, name in ((pA, "A"), (pB, "B")):
    prec, rec, _ = precision_recall_curve(y, p, sample_weight=w)
    ap = average_precision_score(y, p, sample_weight=w)
    plt.plot(rec, prec, label=f"{name} (AP={ap:.3f})")
plt.axhline(prev, linestyle="--", alpha=0.7, label=f"Baseline (prev={prev:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("TEST PR (weighted)")
plt.legend()
plt.tight_layout()
plt.savefig("compare_pr.png", dpi=150)

def _weighted_quantile(x, w, q):
    order = np.argsort(x); xs, ws = x[order], w[order]
    cdf = np.cumsum(ws)/np.sum(ws)
    return np.interp(q, cdf, xs)

def weighted_calibration_curve(y, p, w, n_bins=10, strategy="quantile"):
    """Bin predictions and return weighted mean prediction and outcome per bin.

    With ``strategy="quantile"`` the bin edges come from
    ``_weighted_quantile`` (outer edges forced to 0 and 1); any other value
    gives equal-width edges on [0, 1]. Duplicate edges are merged and bins
    whose total weight is not positive are dropped. Negative weights are
    clipped to 0.

    Returns ``(mean_pred, mean_outcome, edges, bin_weight)`` as arrays.
    """
    labels = y.astype(int)
    probs = p.astype(float)
    weights = np.clip(w.astype(float), 0.0, np.inf)

    grid = np.linspace(0, 1, n_bins + 1)
    if strategy == "quantile":
        edges = _weighted_quantile(probs, weights, grid)
        edges[0], edges[-1] = 0.0, 1.0
    else:
        edges = grid
    edges = np.unique(edges)

    # Only interior edges are passed, so bin labels run 0 .. len(edges) - 2.
    bin_of = np.digitize(probs, edges[1:-1], right=False)

    rows = []
    for b in range(len(edges) - 1):
        in_bin = bin_of == b
        bin_w = weights[in_bin]
        if bin_w.sum() <= 0:
            continue
        rows.append(
            (
                np.average(probs[in_bin], weights=bin_w),
                np.average(labels[in_bin], weights=bin_w),
                bin_w.sum(),
            )
        )

    mean_pred = np.array([r[0] for r in rows])
    mean_obs = np.array([r[1] for r in rows])
    bin_mass = np.array([r[2] for r in rows])
    return mean_pred, mean_obs, edges, bin_mass

def weighted_ece(y, p, w, n_bins=10):
    """Weighted expected calibration error over ``weighted_calibration_curve`` bins.

    Each bin contributes ``|mean outcome - mean prediction|`` weighted by its
    share of the total bin weight. Returns NaN when no bin carries weight.
    """
    mean_pred, mean_obs, _, bin_mass = weighted_calibration_curve(y, p, w, n_bins=n_bins)
    total = bin_mass.sum()
    if len(bin_mass) == 0 or total == 0:
        return np.nan
    gaps = np.abs(mean_obs - mean_pred)
    return float(np.sum((bin_mass / total) * gaps))

# Calibration summary and curve for each model (10 bins).
eceA = weighted_ece(y, pA, w, n_bins=10)
pmA, tmA, _, _ = weighted_calibration_curve(y, pA, w, n_bins=10)
eceB = weighted_ece(y, pB, w, n_bins=10)
pmB, tmB, _, _ = weighted_calibration_curve(y, pB, w, n_bins=10)

# Both curves against the diagonal, saved to compare_calibration.png.
plt.figure(figsize=(7, 6))
plt.plot([0, 1], [0, 1], "--", lw=1, label="Perfect")
plt.plot(pmA, tmA, "o-", label=f"A (ECE={eceA:.3f}, Brier={bA:.3f})")
plt.plot(pmB, tmB, "o-", label=f"B (ECE={eceB:.3f}, Brier={bB:.3f})")
plt.xlabel("Mean predicted probability")
plt.ylabel("Observed positive fraction")
plt.title("TEST Calibration (weighted)")
plt.legend()
plt.tight_layout()
plt.savefig("compare_calibration.png", dpi=150)

def reclass_table(y, p_old, p_new, w, thr):
    """Weighted reclassification table and NRI when moving from *p_old* to *p_new*.

    A row is "high" under a model when its probability is ``>= thr``. Rows
    are counted (by weight) as moving down (high -> low), staying, or moving
    up (low -> high), separately for events (``y == 1``) and non-events
    (``y == 0``).

    Returns ``(table, nri)`` where *table* has index ``event``/``nonevent``
    and columns ``down``/``stay``/``up``, and *nri* is
    ``(P(up|event) - P(down|event)) + (P(down|nonevent) - P(up|nonevent))``.
    A group with no weight makes its proportions, and hence *nri*, NaN.
    """
    old_hi = (p_old >= thr).astype(int)
    new_hi = (p_new >= thr).astype(int)

    def mass(old_state, new_state, outcome):
        selected = (old_hi == old_state) & (new_hi == new_state) & (y == outcome)
        return float(np.sum(w[selected]))

    cells = {
        "down": [mass(1, 0, outcome) for outcome in (1, 0)],
        "stay": [mass(1, 1, outcome) + mass(0, 0, outcome) for outcome in (1, 0)],
        "up": [mass(0, 1, outcome) for outcome in (1, 0)],
    }
    tbl = pd.DataFrame(cells, index=["event", "nonevent"])

    event_mass = float(np.sum(w[y == 1]))
    nonevent_mass = float(np.sum(w[y == 0]))

    def share(row, col, denom):
        return tbl.loc[row, col] / denom if denom > 0 else np.nan

    up_event = share("event", "up", event_mass)
    down_event = share("event", "down", event_mass)
    up_nonevent = share("nonevent", "up", nonevent_mass)
    down_nonevent = share("nonevent", "down", nonevent_mass)

    nri = (up_event - down_event) + (down_nonevent - up_nonevent)
    return tbl, nri

# Reclassification when moving from model A (old) to model B (new).
tbl, nri = reclass_table(y, pA, pB, w, THRESH)
print(f"\n=== Weighted reclassification at thr={THRESH:.3f} (A -> B) ===")
print(tbl)
# reclass_table dichotomises both models at THRESH, so this is a
# two-category NRI rather than a category-free one; the label says so.
print(f"NRI (two-category at this thr): {nri:.3f}")

# Display names for the entries returned by diff_metrics, in the same order.
labels = ["ΔAUC (B−A)","ΔPR-AUC","ΔBrier","ΔSens","ΔSpec","ΔPPV","ΔNPV"]

def diff_metrics(y, pA, pB, w, thr):
    """Return the B-minus-A differences of seven weighted metrics.

    Order of the returned float array: ROC AUC, average precision, Brier
    score, then sensitivity, specificity, PPV and NPV at threshold *thr*.
    """

    def delta(metric):
        return metric(y, pB, sample_weight=w) - metric(y, pA, sample_weight=w)

    score_deltas = [
        delta(roc_auc_score),
        delta(average_precision_score),
        delta(brier_score_loss),
    ]
    at_thr_a = confusion_at(y, pA, w, thr)
    at_thr_b = confusion_at(y, pB, w, thr)
    thr_deltas = [b_val - a_val for a_val, b_val in zip(at_thr_a, at_thr_b)]
    return np.array(score_deltas + thr_deltas, dtype=float)

def psu_bootstrap_within_strata(psu_ids, strata_ids, B=1000, seed=42):
    """Yield *B* bootstrap multiplier vectors, resampling PSUs inside each stratum.

    For every replicate and every stratum, as many PSUs as the stratum
    contains are drawn with replacement from that stratum's PSUs. Each row
    receives the number of times its PSU was drawn (0 if never drawn).

    Args:
        psu_ids: per-row PSU identifiers (array-like, same length as
            *strata_ids*).
        strata_ids: per-row stratum identifiers.
        B: number of replicates to generate.
        seed: seed for the ``numpy.random.RandomState`` driving the draws.

    Yields:
        A fresh float array of per-row multipliers for each replicate.
    """
    rng = np.random.RandomState(seed)
    psu_ids = np.asarray(psu_ids)
    strata_ids = np.asarray(strata_ids)
    n = len(psu_ids)

    # The stratum layout does not change between replicates, so the row
    # positions, the stratum's PSUs and the row -> PSU lookup are built once.
    layout = []
    for s in np.unique(strata_ids):
        rows = np.flatnonzero(strata_ids == s)
        psu_s, row_to_psu = np.unique(psu_ids[rows], return_inverse=True)
        position = {pid: i for i, pid in enumerate(psu_s)}
        layout.append((rows, psu_s, np.ravel(row_to_psu), position))

    for _ in range(B):
        mult = np.zeros(n, dtype=float)
        for rows, psu_s, row_to_psu, position in layout:
            draw = rng.choice(psu_s, size=len(psu_s), replace=True)
            drawn_idx = np.fromiter(
                (position[pid] for pid in draw), dtype=np.intp, count=len(draw)
            )
            counts = np.bincount(drawn_idx, minlength=len(psu_s))
            mult[rows] = counts[row_to_psu]
        yield mult

def psu_bootstrap_unstratified(psu_ids, B=1000, seed=42):
    """Yield *B* bootstrap multiplier vectors, resampling PSUs from one pool.

    For every replicate, as many PSUs as there are distinct values in
    *psu_ids* are drawn with replacement. Each row receives the number of
    times its PSU was drawn (0 if never drawn).

    Args:
        psu_ids: per-row PSU identifiers (array-like). Rows sharing a value
            are resampled together.
        B: number of replicates to generate.
        seed: seed for the ``numpy.random.RandomState`` driving the draws.

    Yields:
        A fresh float array of per-row multipliers for each replicate.
    """
    rng = np.random.RandomState(seed)
    # The row -> PSU lookup is the same for every replicate, so build it once
    # instead of doing a per-row lookup inside the loop.
    u, row_to_psu = np.unique(np.asarray(psu_ids), return_inverse=True)
    row_to_psu = np.ravel(row_to_psu)
    position = {pid: i for i, pid in enumerate(u)}

    for _ in range(B):
        draw = rng.choice(u, size=len(u), replace=True)
        drawn_idx = np.fromiter(
            (position[pid] for pid in draw), dtype=np.intp, count=len(draw)
        )
        counts = np.bincount(drawn_idx, minlength=len(u))
        yield counts[row_to_psu].astype(float)

# PSU-bootstrap confidence intervals for the B-minus-A metric differences.
# Only run when both design identifiers are available.
if (psu is not None) and (stra is not None):
    # The stratified bootstrap needs at least two PSUs in every stratum.
    n_per_stratum = M.groupby("SDMVSTRA")["SDMVPSU"].nunique()
    strat_ok = bool((n_per_stratum >= 2).all())
    if strat_ok:
        gen = psu_bootstrap_within_strata(psu, stra, B=1000, seed=42)
    else:
        print("\nWARNING: Test set has 'lonely PSUs' (some strata with only 1 PSU).")
        print("Using UNSTRATIFIED PSU bootstrap for CIs (design-inconsistent).")
        # PSUs are counted per stratum above, i.e. a PSU code is only
        # meaningful together with its stratum. Pool on (stratum, PSU) pairs
        # so equal codes from different strata stay separate clusters.
        cluster_ids = np.array(
            [f"{s_id}|{p_id}" for s_id, p_id in zip(stra, psu)], dtype=object
        )
        gen = psu_bootstrap_unstratified(cluster_ids, B=1000, seed=42)

    diffs_collect = []
    reps_used = 0
    for mult in gen:
        wb = w * mult
        # Skip replicates with no weight at all, or with no weight in one
        # of the two outcome classes.
        if wb.sum() <= 0:
            continue
        if (np.sum(wb[y==1]) == 0) or (np.sum(wb[y==0]) == 0):
            continue
        diffs_collect.append(diff_metrics(y, pA, pB, wb, THRESH))
        reps_used += 1

    diffs_collect = np.asarray(diffs_collect, float)
    print("\n=== PSU-bootstrap 95% CIs for metric differences (B − A) — logit-percentile ===")
    print(f"Bootstrap replicates used: {reps_used}")
    point = diff_metrics(y, pA, pB, w, THRESH)

    if reps_used == 0:
        # With no replicates diffs_collect is 1-D and empty, so it cannot be
        # indexed by column; report the point estimates without intervals.
        print("No usable bootstrap replicates; CIs unavailable.")
        for i, lab in enumerate(labels):
            print(f"{lab}: point={point[i]:.4f}")
    else:
        for i, lab in enumerate(labels):
            lo, hi = ci_logit_percentile_diff01(diffs_collect[:, i])
            print(f"{lab}: point={point[i]:.4f} | 95% CI [{lo:.4f}, {hi:.4f}]")
else:
    print("\nDesign IDs missing (SDMVPSU/SDMVSTRA). Skipping design-based CIs.")

# Scatter of the two models' predicted probabilities with a diagonal
# reference line, saved to compare_scatter_pA_pB.png.
plt.figure(figsize=(6, 6))
plt.scatter(pA, pB, s=8, alpha=0.25)
plt.plot([0, 1], [0, 1], "--", alpha=0.5)
plt.xlabel("p(A)")
plt.ylabel("p(B)")
plt.title("Predicted probabilities: Model A vs B")
plt.tight_layout()
plt.savefig("compare_scatter_pA_pB.png", dpi=150)
# ---
# One row per metric: value for each model and the B-minus-A difference.
summary_df = pd.DataFrame(
    [
        ("AUC", aucA, aucB, aucB - aucA),
        ("PR-AUC", apA, apB, apB - apA),
        ("Brier", bA, bB, bB - bA),
    ],
    columns=["metric", "Model_A", "Model_B", "Diff_B_minus_A"],
)
summary_df.to_csv("compare_summary.csv", index=False)

print(
    "\nSaved: compare_roc.png, compare_pr.png, compare_calibration.png, compare_scatter_pA_pB.png, compare_summary.csv"
)
plt.show()
