# Stage 11 — Evaluation & Risk Communication 

In [None]:

import numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,4)
plt.rcParams["axes.grid"] = True
import sys
ROOT = Path.cwd().parents[0] if (Path.cwd().name == "notebooks") else Path.cwd()
sys.path.append(str(ROOT / "src"))
from evaluation import (
    generate_synthetic_data, simple_model_fit_predict,
    rmse, mae, bootstrap_metric_ci, scenario_impute,
    scenario_drop_missing, subgroup_metric_by
)
SEED = 123
np.random.seed(SEED)


## 1) Load data

In [None]:

data_path = (ROOT / "data" / "data_stage11_eval_risk.csv")
if data_path.exists():
    df = pd.read_csv(data_path)
else:
    df = generate_synthetic_data(n=800, missing_rate=0.08, seed=42)
    data_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(data_path, index=False)
display(df.head())
print("Shape:", df.shape, "| Missing:", df.isna().sum().to_dict())


## 2) Baseline fit & metric

In [None]:

df_mean = scenario_impute(df, strategy="mean")
X = df_mean[["x1","x2","x3"]]
y = df_mean["y"]
model, pred = simple_model_fit_predict(X, y)
base_rmse = rmse(y, pred)
base_mae = mae(y, pred)
print("Baseline RMSE:", round(base_rmse, 4), "| MAE:", round(base_mae, 4))

resid = y - pred
plt.scatter(pred, resid, s=12, alpha=0.7)
plt.axhline(0, linestyle="--")
plt.title("Residuals vs Prediction (Baseline)")
plt.xlabel("Prediction"); plt.ylabel("Residual")
plt.show()


## 3) Bootstrap CI for metric (RMSE)

In [None]:

ci, point_est, dist = bootstrap_metric_ci(X, y, metric_fn=rmse, B=600, seed=SEED, return_dist=True)
print("Bootstrap RMSE point estimate:", round(point_est, 4))
print("95% CI:", (round(ci[0],4), round(ci[1],4)))

plt.hist(dist, bins=30, alpha=0.8)
plt.axvline(ci[0], linestyle="--")
plt.axvline(ci[1], linestyle="--")
plt.title("Bootstrap distribution of RMSE")
plt.xlabel("RMSE"); plt.ylabel("Frequency")
plt.show()


## 4) Scenario comparisons (≥ 2)

In [None]:

df_median = scenario_impute(df, strategy="median")
XA, yA = df_median[["x1","x2","x3"]], df_median["y"]
_, pA = simple_model_fit_predict(XA, yA)
rmse_A, mae_A = rmse(yA, pA), mae(yA, pA)

df_drop = scenario_drop_missing(df)
XB, yB = df_drop[["x1","x2","x3"]], df_drop["y"]
_, pB = simple_model_fit_predict(XB, yB)
rmse_B, mae_B = rmse(yB, pB), mae(yB, pB)

comp = pd.DataFrame({
    "scenario": ["Baseline(mean)", "Median", "DropMissing"],
    "RMSE": [float(base_rmse), float(rmse_A), float(rmse_B)],
    "MAE": [float(base_mae), float(mae_A), float(mae_B)],
    "n_obs": [len(df_mean), len(df_median), len(df_drop)]
})
display(comp)

xs = np.arange(len(comp))
width = 0.35
plt.bar(xs - width/2, comp["RMSE"], width, label="RMSE")
plt.bar(xs + width/2, comp["MAE"], width, label="MAE")
plt.xticks(xs, comp["scenario"])
plt.title("Scenario comparison (consistent axes)")
plt.ylabel("Error")
plt.legend()
plt.show()


## 5) Subgroup diagnostics (by `segment`)

In [None]:

dfb = df_mean.assign(pred=pred, resid=y - pred)
sub = subgroup_metric_by(dfb, metric_fn=rmse, subgroup_col="segment")
display(sub)

plt.bar(sub["segment"].astype(str), sub["rmse"])
plt.title("RMSE by segment")
plt.xlabel("segment"); plt.ylabel("RMSE")
plt.show()

for s, g in dfb.groupby("segment"):
    plt.scatter(g["pred"], g["resid"], s=10, alpha=0.5, label=f"seg {s}")
plt.axhline(0, linestyle="--")
plt.title("Residuals by segment")
plt.xlabel("Prediction"); plt.ylabel("Residual")
plt.legend()
plt.show()



## 6) Stakeholder-facing summary (≤ 1 page)

**Assumptions**: linear relation; baseline mean imputation; stable generating process.  
**Uncertainty**: 95% CI on RMSE from bootstrap (B=600).  
**Sensitivity**: median imputation close to baseline; dropping rows reduces n and can bias if missingness ≠ MCAR.  
**Subgroups**: Segment D shows higher error (noisier).  

**Holds if** missing-rate ≲10% and segment mix stable.  
**Sensitive to** missing-rate >15% or surge in Segment D; consider robust/segment-aware models next.
