
# 🌪️ Storm Damage Prediction — End‑to‑End Explainable Pipeline (Final)



This notebook builds a **production‑style ML pipeline** that predicts **Property** and **Crop** damages from a natural‑hazard dataset that includes **structured** fields and **unstructured narratives**.

### What you get
- ✅ **Robust data cleaning** (drop ids, unify date/time, duration in seconds)
- ✅ **Geo feature engineering** (mean lat/lon, sin/cos, azimuths)
- ✅ **Text features from a pretrained LLM** (`SentenceTransformer: all-MiniLM-L6-v2`)
- ✅ **Models**: Ridge, XGBoost, FT‑Transformer via `rtdl` (PyTorch; falls back to an sklearn MLP when `rtdl`/`torch` is unavailable) — multi‑output regression
- ✅ **Optuna hyperparameter tuning** (Optuna's default sampler, 15 trials per model)
- ✅ **Top‑20 feature selection** via permutation importance (on preprocessed space)
- ✅ **Explainability**: SHAP global & local, LLM natural‑language explanations
- ✅ **Executive Summary** (LLM), **Dual‑Target Explainability PDF report**
- ✅ All artifacts saved to `./artifacts_pipeline`

> **Note**: This notebook expects the CSV file to be in the same folder:
> `StormEvents_details-ftp_v1.0_d2013_c20250520.csv`


In [None]:

# === 1) Config & Imports ===
import os, re, json, math, gc, warnings
import numpy as np
import pandas as pd

from pathlib import Path
from typing import Optional, Tuple, List

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.inspection import permutation_importance

import joblib

warnings.filterwarnings("ignore")

# Text embeddings
from sentence_transformers import SentenceTransformer

# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor

# Optuna
import optuna

# Optional TabTransformer (rtdl). If missing, we fallback to MLPRegressor.
USE_RTDL = True
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import rtdl
except Exception as e:
    print("ℹ️ rtdl/torch not available; will fallback to sklearn MLPRegressor for tabular deep model.")
    USE_RTDL = False
    from sklearn.neural_network import MLPRegressor

# SHAP & plotting (for report section later)
import shap
import matplotlib.pyplot as plt

# OpenAI for LLM explanations (optional)
try:
    from openai import OpenAI
    OPENAI_READY = bool(os.getenv("OPENAI_API_KEY"))
except Exception:
    OPENAI_READY = False

# Reportlab for PDF
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
from reportlab.lib import colors

# === Paths ===
# Input CSV is resolved relative to the current working directory.
CSV_PATH = "StormEvents_details-ftp_v1.0_d2013_c20250520.csv"  # in same folder as notebook
# Every artifact written by later cells (JSON, CSV, PNG, joblib, PDF) goes here.
OUTDIR   = "./artifacts_pipeline"
os.makedirs(OUTDIR, exist_ok=True)

# === Text model ===
# NOTE(review): later cells hard-code the same model name instead of reading
# this constant — keep them in sync if it is changed.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"  # pretrained SentenceTransformer (LLM encoder)

print("Config OK. Output dir:", OUTDIR)


In [None]:

# === 2) Helpers ===

def parse_damage(v):
    """Convert a damage amount such as ``"10.5K"`` or ``"2M"`` to a float.

    Thousands separators and a leading ``$`` are ignored, and the suffixes
    ``K``/``M``/``B`` (case-insensitive) scale the number by 1e3/1e6/1e9.

    Returns:
        The amount as a float, or ``np.nan`` when the value is missing,
        is one of the placeholders ``""``/``NA``/``N/A``/``NONE``, or cannot
        be parsed as a number.
    """
    if pd.isna(v):
        return np.nan
    s = str(v).strip().upper().replace(",", "").lstrip("$")
    if s in ("", "NA", "N/A", "NONE"):
        return np.nan
    multipliers = {"K": 1e3, "M": 1e6, "B": 1e9}
    mult = multipliers.get(s[-1], 1.0)
    if s[-1] in multipliers:
        s = s[:-1]
    try:
        return float(s) * mult
    except ValueError:
        # float() on a str can only fail with ValueError (e.g. a bare "K").
        return np.nan

def to_dt(series):
    """Parse ``series`` to datetimes; unparseable entries become ``NaT``.

    ``infer_datetime_format`` is deliberately not passed: pandas 2.x
    deprecates it and already infers a single format from the first
    non-null value, so the argument only produced a warning.
    """
    return pd.to_datetime(series, errors="coerce")

def from_ymd_time_cols(df, side):
    """Build timestamps from ``<SIDE>_YEARMONTH``, ``<SIDE>_DAY`` and ``<SIDE>_TIME``.

    ``YEARMONTH`` is read as ``YYYYMM`` and ``TIME`` as ``HHMM``.

    Args:
        df: Frame holding the three component columns.
        side: Column prefix (upper-cased here), e.g. ``"BEGIN"`` or ``"END"``.

    Returns:
        A datetime Series aligned with ``df.index`` (``NaT`` where a component
        is missing or the combination is not a valid date/time), or ``None``
        when any component column is absent or entirely null.
    """
    prefix = side.upper()
    # Explicit dtype: a dtype-less empty Series warns on older pandas.
    placeholder = pd.Series(np.nan, index=df.index, dtype="float64")

    parts = []
    for suffix in ("YEARMONTH", "DAY", "TIME"):
        column = df.get(f"{prefix}_{suffix}", placeholder)
        if column.isna().all():
            return None
        parts.append(pd.to_numeric(column, errors="coerce"))

    def make_ts(ym, day, hhmm):
        if pd.isna(ym) or pd.isna(day) or pd.isna(hhmm):
            return pd.NaT
        ym, hhmm = int(ym), int(hhmm)
        try:
            return pd.Timestamp(year=ym // 100, month=ym % 100, day=int(day),
                                hour=hhmm // 100, minute=hhmm % 100)
        except ValueError:
            # Impossible dates/times (e.g. Feb 30, hour 24) are treated as missing.
            return pd.NaT

    stamps = [make_ts(*row) for row in zip(*parts)]
    return pd.Series(stamps, index=df.index, dtype="datetime64[ns]")

def duration_seconds(begin_ts, end_ts):
    """Return ``end_ts - begin_ts`` expressed in seconds (numeric Series)."""
    elapsed = end_ts - begin_ts
    seconds = elapsed.dt.total_seconds()
    return pd.to_numeric(seconds, errors="coerce")

# 16-point compass rose mapped to bearings in degrees (N = 0, then every 22.5°).
AZIMUTH_MAP = dict(zip(
    ("N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
     "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"),
    (0, 22.5, 45, 67.5, 90, 112.5, 135, 157.5,
     180, 202.5, 225, 247.5, 270, 292.5, 315, 337.5),
))

import math
def metrics_frame(y_true, y_pred, labels):
    """Tabulate MAE, RMSE and R2 for each target column.

    Column ``i`` of ``y_true``/``y_pred`` is scored under ``labels[i]``; the
    result has one row per label and the columns ``MAE``, ``RMSE``, ``R2``.
    """
    per_target = {}
    for col, label in enumerate(labels):
        actual = y_true[:, col]
        predicted = y_pred[:, col]
        per_target[label] = dict(
            MAE=mean_absolute_error(actual, predicted),
            RMSE=math.sqrt(mean_squared_error(actual, predicted)),
            R2=r2_score(actual, predicted),
        )
    return pd.DataFrame(per_target).T

print("Helper functions ready.")


In [None]:

# === 3) Load & Clean ===
# Fail with a real exception: asserts are stripped under ``python -O``.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Place it next to the notebook.")

df = pd.read_csv(CSV_PATH, low_memory=False, encoding="utf-8", na_values=["","NA","NaN","N/A"])
print("Raw shape:", df.shape)

# Normalise headers so later lookups can rely on UPPER_SNAKE_CASE names.
df.columns = [c.strip().upper().replace(" ", "_") for c in df.columns]

# Identifier / provenance columns carry no predictive signal.
df.drop(columns=["EPISODE_ID", "EVENT_ID", "DATA_SOURCE"], inplace=True, errors="ignore")

prop_col = "DAMAGE_PROPERTY" if "DAMAGE_PROPERTY" in df.columns else None
crop_col = "DAMAGE_CROPS"    if "DAMAGE_CROPS"    in df.columns else None
if not (prop_col and crop_col):
    raise KeyError("Missing target columns DAMAGE_PROPERTY/DAMAGE_CROPS.")

df["Y_PROP"] = df[prop_col].apply(parse_damage)
df["Y_CROP"] = df[crop_col].apply(parse_damage)

# Guarantee both narrative columns exist and contain strings only.
# (The previous ``df.get(col, "").fillna("")`` raised AttributeError when a
# column was absent, because the fallback was a plain str.)
for narrative_col in ("EPISODE_NARRATIVE", "EVENT_NARRATIVE"):
    if narrative_col in df.columns:
        df[narrative_col] = df[narrative_col].fillna("")
    else:
        df[narrative_col] = ""

# Prefer the combined date-time columns; otherwise rebuild from the parts.
begin_ts = None; end_ts = None
if "BEGIN_DATE_TIME" in df.columns and "END_DATE_TIME" in df.columns:
    begin_ts = to_dt(df["BEGIN_DATE_TIME"])
    end_ts = to_dt(df["END_DATE_TIME"])
else:
    begin_ts = from_ymd_time_cols(df, "BEGIN")
    end_ts = from_ymd_time_cols(df, "END")

# from_ymd_time_cols returns None when its columns are unavailable; fall back
# to an all-missing duration instead of crashing on ``None - None``.
if begin_ts is None or end_ts is None:
    df["DURATION_SECONDS"] = np.nan
else:
    df["DURATION_SECONDS"] = duration_seconds(begin_ts, end_ts)

dur_mean = df["DURATION_SECONDS"].mean(skipna=True)
dur_std = df["DURATION_SECONDS"].std(skipna=True)
# ``nan or 1.0`` evaluates to nan (NaN is truthy), so test explicitly.
if pd.isna(dur_std) or dur_std == 0:
    dur_std = 1.0
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/valid/test split performed in a later cell.
df["DURATION_SECONDS_STD"] = (df["DURATION_SECONDS"] - dur_mean) / dur_std

for c in ["BEGIN_LAT","BEGIN_LON","END_LAT","END_LON"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Midpoint of the begin/end coordinates (falls back to the begin point).
df["LAT_MEAN"] = df[["BEGIN_LAT","END_LAT"]].mean(axis=1, skipna=True) if {"BEGIN_LAT","END_LAT"}.issubset(df.columns) else df.get("BEGIN_LAT", np.nan)
df["LON_MEAN"] = df[["BEGIN_LON","END_LON"]].mean(axis=1, skipna=True) if {"BEGIN_LON","END_LON"}.issubset(df.columns) else df.get("BEGIN_LON", np.nan)
df["LAT_SIN"] = np.sin(np.deg2rad(df["LAT_MEAN"])); df["LAT_COS"] = np.cos(np.deg2rad(df["LAT_MEAN"]))
df["LON_SIN"] = np.sin(np.deg2rad(df["LON_MEAN"])); df["LON_COS"] = np.cos(np.deg2rad(df["LON_MEAN"]))

# Compass labels -> degrees; labels not in AZIMUTH_MAP become NaN.
for side in ["BEGIN","END"]:
    azc = f"{side}_AZIMUTH"
    if azc in df.columns:
        df[f"{azc}_DEG"] = df[azc].astype(str).str.strip().str.upper().map(AZIMUTH_MAP)

if {"BEGIN_AZIMUTH_DEG","END_AZIMUTH_DEG"}.issubset(df.columns):
    # Bearings are circular: the arithmetic mean of 350° and 10° is 180°,
    # the opposite of the true mean direction (0°). Average the unit vectors
    # instead and convert the resultant back to an angle.
    az_rad = np.deg2rad(df[["BEGIN_AZIMUTH_DEG","END_AZIMUTH_DEG"]].astype(float))
    sin_mean = np.sin(az_rad).mean(axis=1)   # skips NaN, so one-sided rows still work
    cos_mean = np.cos(az_rad).mean(axis=1)
    mean_rad = np.arctan2(sin_mean, cos_mean)
    # Exactly opposite bearings cancel out: the mean direction is undefined.
    mean_rad = mean_rad.where(np.hypot(sin_mean, cos_mean) > 1e-9)
    df["AZIMUTH_DEG_MEAN"] = np.rad2deg(mean_rad) % 360.0
    df["AZIMUTH_SIN"] = np.sin(mean_rad)
    df["AZIMUTH_COS"] = np.cos(mean_rad)

if {"BEGIN_RANGE","END_RANGE"}.issubset(df.columns):
    df["RANGE_MEAN"] = pd.to_numeric(df["BEGIN_RANGE"], errors="coerce").add(pd.to_numeric(df["END_RANGE"], errors="coerce")) / 2.0

# Prefer the begin location, falling back to the end location when missing.
df["LOCATION_NAME"] = df.get("BEGIN_LOCATION", pd.Series(index=df.index, dtype="object")).fillna(df.get("END_LOCATION",""))

# Raw inputs are superseded by the engineered features above.
df.drop(columns=["BEGIN_LAT","BEGIN_LON","END_LAT","END_LON","BEGIN_AZIMUTH","END_AZIMUTH",
                 "BEGIN_LOCATION","END_LOCATION","BEGIN_RANGE","END_RANGE"],
        inplace=True, errors="ignore")

# The unparsed target strings must not remain available as features.
if prop_col in df.columns: df.drop(columns=prop_col, inplace=True, errors="ignore")
if crop_col in df.columns: df.drop(columns=crop_col, inplace=True, errors="ignore")

# Small summary of the cleaned frame, persisted for later reporting cells.
snapshot = {"rows": int(len(df)), "cols": int(len(df.columns)), "missing_Y_PROP": int(df["Y_PROP"].isna().sum()), "missing_Y_CROP": int(df["Y_CROP"].isna().sum())}
# Context manager so the handle is flushed and closed deterministically.
with open(os.path.join(OUTDIR, "data_snapshot.json"), "w", encoding="utf-8") as fh:
    json.dump(snapshot, fh, indent=2)

print("Clean shape:", df.shape)
df.head(3)


In [None]:

# === 4) Feature sets & splits ===
# Keep only the candidate columns that actually exist in this CSV.
text_cols = [c for c in ["EPISODE_NARRATIVE","EVENT_NARRATIVE"] if c in df.columns]
cat_cols  = [c for c in ["STATE","EVENT_TYPE","CZ_TYPE","CZ_NAME","LOCATION_NAME"] if c in df.columns]
num_cols  = [c for c in [
    "DURATION_SECONDS_STD",
    "LAT_MEAN","LON_MEAN","LAT_SIN","LAT_COS","LON_SIN","LON_COS",
    "AZIMUTH_DEG_MEAN","AZIMUTH_SIN","AZIMUTH_COS",
    "INJURIES_DIRECT","INJURIES_INDIRECT","DEATHS_DIRECT","DEATHS_INDIRECT"
] if c in df.columns]

print("text_cols:", text_cols)
print("cat_cols :", cat_cols)
print("num_cols :", len(num_cols))

# Rows missing either target are dropped; targets are modelled as log1p(y),
# column 0 = property damage, column 1 = crop damage.
df_model = df.dropna(subset=["Y_PROP","Y_CROP"]).copy()
y = np.column_stack([df_model["Y_PROP"].values, df_model["Y_CROP"].values]); y_log = np.log1p(y)
X = df_model[num_cols + cat_cols + text_cols].copy()

# 70 / 15 / 15 split: hold out 30 %, then halve it into validation and test.
X_train, X_temp, y_train_log, y_temp_log = train_test_split(X, y_log, test_size=0.30, random_state=42)
X_valid, X_test, y_valid_log, y_test_log = train_test_split(X_temp, y_temp_log, test_size=0.50, random_state=42)
print("Sizes -> Train:", len(X_train), "| Valid:", len(X_valid), "| Test:", len(X_test))


In [None]:

# === 5) Preprocessing ===
class SBERTEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that embeds one text column with a SentenceTransformer.

    Args:
        model_name: Name passed to ``SentenceTransformer``.
        device: Device passed to ``SentenceTransformer`` (``None`` lets the
            library choose).
        batch_size: Batch size passed to ``encode``.

    The model is loaded lazily on the first ``transform`` call.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", device=None, batch_size=64):
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size
        self._model = None

    def _lazy_model(self):
        """Load the SentenceTransformer once and cache it on the instance."""
        if getattr(self, "_model", None) is None:
            self._model = SentenceTransformer(self.model_name, device=self.device if self.device else None)
        return self._model

    def fit(self, X, y=None):
        """No-op: the encoder is pretrained."""
        return self

    @staticmethod
    def _as_text_list(X):
        """Return the first column of ``X`` as a list of str, with missing -> ""."""
        if isinstance(X, pd.DataFrame):
            column = X.iloc[:, 0]
        elif isinstance(X, pd.Series):
            column = X
        else:
            values = np.asarray(X, dtype=object)
            if values.ndim > 1:
                # e.g. an (n, 1) array handed over by a ColumnTransformer
                values = values[:, 0]
            column = pd.Series(values)
        # Fill BEFORE casting: astype(str) would turn NaN into the text "nan",
        # which would then be embedded as if it were a real narrative.
        return column.fillna("").astype(str).tolist()

    def transform(self, X):
        """Encode the texts in ``X`` and return the embeddings as a NumPy array."""
        model = self._lazy_model()
        texts = self._as_text_list(X)
        return model.encode(texts, batch_size=self.batch_size, show_progress_bar=False,
                            convert_to_numpy=True, normalize_embeddings=False)

# Numeric features: median imputation followed by standardisation.
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),("scaler", StandardScaler())])

# scikit-learn 1.2 renamed OneHotEncoder's ``sparse`` argument to
# ``sparse_output`` and 1.4 removed the old name; support both.
try:
    onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),("onehot", onehot)])

transformers = []
if num_cols: transformers.append(("num", num_pipe, num_cols))
if cat_cols: transformers.append(("cat", cat_pipe, cat_cols))
for tc in text_cols:
    # One encoder per narrative column; the model name comes from the config
    # cell so that the setting there actually takes effect.
    transformers.append((f"text_{tc.lower()}", Pipeline([("sbert", SBERTEncoder(model_name=EMBED_MODEL_NAME))]), [tc]))

preprocess = ColumnTransformer(transformers, remainder="drop", n_jobs=None)
print("Preprocessing ready.")


In [None]:

# === 6) Models ===
def make_ridge(alpha=1.0):
    """Return a ``MultiOutputRegressor`` wrapping ``Ridge(alpha=alpha)``."""
    base_model = Ridge(alpha=alpha, random_state=42)
    return MultiOutputRegressor(base_model)

def make_xgb(n_estimators=300, max_depth=6, learning_rate=0.05):
    """Return a ``MultiOutputRegressor`` wrapping a histogram-based ``XGBRegressor``."""
    xgb_params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        reg_lambda=1.0,
    )
    return MultiOutputRegressor(XGBRegressor(**xgb_params))

# Mirror the availability flag set in the config cell; the torch/rtdl names
# are (re)imported here only when that earlier import succeeded.
USE_RTDL_FLAG = USE_RTDL
if USE_RTDL_FLAG:
    import torch, torch.nn as nn, torch.optim as optim, rtdl

class TorchTabTransformerRegressor(BaseEstimator):
    """Regressor backed by rtdl's FT-Transformer over numeric features only.

    Despite the class name, the network built in ``fit`` is
    ``rtdl.FTTransformer``; every input column is treated as numeric.

    Args:
        epochs: Number of passes over the training data.
        lr: Adam learning rate.
        batch_size: Mini-batch size for training and prediction.
        device: Torch device string; defaults to CUDA when available.
    """

    def __init__(self, epochs=10, lr=1e-3, batch_size=256, device=None):
        self.epochs = epochs; self.lr = lr; self.batch_size = batch_size
        self.device = device or ("cuda" if (USE_RTDL_FLAG and torch.cuda.is_available()) else "cpu")
        self.model = None

    def fit(self, X, y):
        """Train on ``X`` (n, d) and ``y`` (n,) or (n, k) with an MSE loss."""
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        # A 1-D target must become (n, 1): MSELoss would otherwise broadcast
        # the (n, 1) prediction against (n,) into an (n, n) matrix.
        self.squeeze_output_ = y.ndim == 1
        if self.squeeze_output_:
            y = y.reshape(-1, 1)
        self.n_outputs_ = y.shape[1]

        # NOTE(review): ``make_default`` does not accept d_token / dropout
        # arguments; explicit hyperparameters go through ``make_baseline``
        # (rtdl 0.0.13 signature — confirm against the installed version).
        # ffn_d_hidden=256 follows the library's usual 4/3 * d_token sizing.
        model = rtdl.FTTransformer.make_baseline(
            n_num_features=X.shape[1], cat_cardinalities=None,
            d_token=192, n_blocks=2, attention_dropout=0.2,
            ffn_d_hidden=256, ffn_dropout=0.2, residual_dropout=0.0,
            last_layer_query_idx=[-1], d_out=self.n_outputs_,
        ).to(self.device)
        opt = optim.Adam(model.parameters(), lr=self.lr); loss_fn = nn.MSELoss()
        dl = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32),
                                           torch.tensor(y, dtype=torch.float32)),
            batch_size=self.batch_size, shuffle=True)
        model.train()
        for _ in range(self.epochs):
            for xb, yb in dl:
                xb = xb.to(self.device); yb = yb.to(self.device)
                opt.zero_grad()
                # forward takes (x_num, x_cat); there are no categorical inputs.
                loss = loss_fn(model(xb, None), yb)
                loss.backward(); opt.step()
        self.model = model
        return self

    def predict(self, X):
        """Return predictions shaped like the ``y`` given to ``fit``."""
        if self.model is None:
            raise RuntimeError("TorchTabTransformerRegressor.predict called before fit.")
        X = np.asarray(X, dtype=np.float32)
        self.model.eval()
        chunks = []
        with torch.no_grad():
            # Predict in mini-batches so large inputs do not exhaust device memory.
            for start in range(0, len(X), self.batch_size):
                xb = torch.tensor(X[start:start + self.batch_size], dtype=torch.float32, device=self.device)
                chunks.append(self.model(xb, None).cpu().numpy())
        n_out = getattr(self, "n_outputs_", 1)
        preds = np.concatenate(chunks, axis=0) if chunks else np.empty((0, n_out), dtype=np.float32)
        if getattr(self, "squeeze_output_", False):
            preds = preds.reshape(-1)
        return preds

    def score(self, X, y):
        """R2 of ``predict(X)`` against ``y``.

        Provided so utilities that fall back to ``estimator.score`` (such as
        ``permutation_importance`` with ``scoring=None``) work with this class.
        """
        return r2_score(y, self.predict(X))

def make_tabtr(epochs=10, lr=1e-3):
    """Return the tabular deep model, or an MLP stand-in when rtdl is unavailable.

    ``epochs`` and ``lr`` only affect the torch model; the MLP fallback uses
    fixed settings.
    """
    if not USE_RTDL_FLAG:
        from sklearn.neural_network import MLPRegressor
        fallback = MLPRegressor(hidden_layer_sizes=(256,128), activation='relu', random_state=42, max_iter=200)
        return MultiOutputRegressor(fallback)
    return TorchTabTransformerRegressor(epochs=epochs, lr=lr)

print("Models ready.")


In [None]:

# === 7) First-pass evaluation ===
def eval_on_split(pipe, Xtr, ytr_log, Xev, yev_log, label):
    """Fit ``pipe`` on the training split and score it on the evaluation split.

    Targets are in log1p space; predictions and ground truth are mapped back
    with ``expm1`` before the metrics are computed.

    Returns:
        ``(metrics_dataframe, predictions_in_original_units)``.
    """
    pipe.fit(Xtr, ytr_log)
    pred_log = pipe.predict(Xev)
    yhat = np.expm1(pred_log)
    ytrue = np.expm1(yev_log)
    mf = metrics_frame(ytrue, yhat, ["damage_property","damage_crops"])
    # Real newlines: the escaped "\\n" printed a literal backslash-n.
    print(f"\n📊 {label} results:\n{mf}")
    return mf, yhat

# NOTE: the three pipelines share the single ``preprocess`` object, so every
# ``fit`` re-fits it in place (always on the same training data).
pipes = {
    "ridge": Pipeline([("prep", preprocess), ("reg", make_ridge(alpha=1.0))]),
    "xgb":   Pipeline([("prep", preprocess), ("reg", make_xgb())]),
    "tabtr": Pipeline([("prep", preprocess), ("reg", make_tabtr(epochs=10, lr=1e-3))])
}
val_results = {}
for name, pipe in pipes.items():
    mf, _ = eval_on_split(pipe, X_train, y_train_log, X_valid, y_valid_log, f"{name} (validation)")
    val_results[name] = mf

# Context manager so the metrics file is flushed and closed.
first_pass_path = os.path.join(OUTDIR, "validation_metrics_first_pass.json")
with open(first_pass_path, "w", encoding="utf-8") as fh:
    json.dump({k: v.to_dict(orient="index") for k, v in val_results.items()}, fh, indent=2)
print("✓ Saved first-pass metrics.")


In [None]:

# === 8) Optuna tuning ===
def rmse_sum_pred(y_true_log, y_pred_log):
    """Sum of the two per-target RMSEs, computed after undoing the log1p transform."""
    actual = np.expm1(y_true_log)
    predicted = np.expm1(y_pred_log)
    rmse_first = math.sqrt(mean_squared_error(actual[:, 0], predicted[:, 0]))
    rmse_second = math.sqrt(mean_squared_error(actual[:, 1], predicted[:, 1]))
    return rmse_first + rmse_second

def objective_factory(model_name):
    """Return an Optuna objective for ``model_name``.

    Args:
        model_name: One of ``"ridge"``, ``"xgb"`` or ``"tabtr"``.

    Returns:
        A function ``objective(trial)`` that samples hyperparameters, fits the
        pipeline on the training split and returns ``rmse_sum_pred`` on the
        validation split (lower is better).

    Raises:
        ValueError: If ``model_name`` is not supported. Previously this only
            surfaced inside the first trial as an unbound ``model`` variable.
    """
    if model_name not in ("ridge", "xgb", "tabtr"):
        raise ValueError(f"Unknown model_name {model_name!r}; expected 'ridge', 'xgb' or 'tabtr'.")

    def objective(trial):
        if model_name == "ridge":
            alpha = trial.suggest_float("alpha", 0.01, 200.0, log=True)
            model = make_ridge(alpha=alpha)
        elif model_name == "xgb":
            n_estimators = trial.suggest_int("n_estimators", 200, 800, step=100)
            max_depth    = trial.suggest_int("max_depth", 3, 9, step=1)
            learning_rate= trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
            model = make_xgb(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
        else:  # "tabtr"
            epochs = trial.suggest_int("epochs", 8, 20, step=4)
            lr     = trial.suggest_float("lr", 5e-4, 2e-3, log=True)
            model  = make_tabtr(epochs=epochs, lr=lr)
        pipe = Pipeline([("prep", preprocess), ("reg", model)])
        pipe.fit(X_train, y_train_log)
        pred = pipe.predict(X_valid)
        return rmse_sum_pred(y_valid_log, pred)

    return objective

best_params = {}
for name in ["ridge","xgb","tabtr"]:
    # Seed the sampler so repeated runs explore the same hyperparameters.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", study_name=f"{name}_tuning", sampler=sampler)
    study.optimize(objective_factory(name), n_trials=15, show_progress_bar=False)
    best_params[name] = study.best_params
    print(f"Best {name} params:", study.best_params, "-> score:", study.best_value)

pd.DataFrame(best_params).to_csv(os.path.join(OUTDIR,"best_hyperparams.csv"))
# Context manager so the JSON is fully written before the next cell reads it.
with open(os.path.join(OUTDIR, "best_hyperparams.json"), "w", encoding="utf-8") as fh:
    json.dump(best_params, fh, indent=2)
print("✓ Saved best_hyperparams.csv/.json")


In [None]:

# === 9) Winner + Permutation Importance + Top‑20 + Test ===
with open(os.path.join(OUTDIR, "best_hyperparams.json"), encoding="utf-8") as fh:
    tuned = json.load(fh)

# Rebuild each model with its tuned hyperparameters (defaults if none saved).
pipes_tuned = {
    "ridge": Pipeline([("prep", preprocess), ("reg", make_ridge(**tuned.get("ridge", {"alpha":1.0}))) ]),
    "xgb":   Pipeline([("prep", preprocess), ("reg", make_xgb(**tuned.get("xgb", {}))) ]),
    "tabtr": Pipeline([("prep", preprocess), ("reg", make_tabtr(**tuned.get("tabtr", {}))) ])
}
val_tuned = {}
for name, pipe in pipes_tuned.items():
    mf, _ = eval_on_split(pipe, X_train, y_train_log, X_valid, y_valid_log, f"{name} tuned (validation)")
    val_tuned[name] = mf

def rmse_sum_from_df(dfm):
    """Sum the RMSE column of a metrics frame (one row per target)."""
    return float(dfm["RMSE"].sum())

# The winner is the model with the lowest summed validation RMSE.
winner = min(val_tuned, key=lambda n: rmse_sum_from_df(val_tuned[n]))
print("🏆 Winner after tuning:", winner)

# eval_on_split already fitted this pipeline on X_train, so no refit is
# needed; reusing it keeps exactly the model that won the comparison.
best_pipe = pipes_tuned[winner]
Xt_train = best_pipe.named_steps["prep"].transform(X_train)
Xt_valid = best_pipe.named_steps["prep"].transform(X_valid)
final_est = best_pipe.named_steps["reg"]

def train_single_target_estimator(est, Xt, y_log, i):
    """Fit a fresh, unfitted copy of ``est``'s base model on target column ``i``.

    For a ``MultiOutputRegressor`` the first fitted sub-estimator serves as the
    template; any other estimator is cloned as-is.
    """
    template = est.estimators_[0] if isinstance(est, MultiOutputRegressor) else est
    single = clone(template)
    single.fit(Xt, y_log[:, i])
    return single

# One single-target model per damage type, scored on the validation split.
m0 = train_single_target_estimator(final_est, Xt_train, y_train_log, 0)
m1 = train_single_target_estimator(final_est, Xt_train, y_train_log, 1)
perm1 = permutation_importance(m0, Xt_valid, y_valid_log[:,0], n_repeats=5, random_state=42, n_jobs=-1)
perm2 = permutation_importance(m1, Xt_valid, y_valid_log[:,1], n_repeats=5, random_state=42, n_jobs=-1)
imp_mean = (perm1.importances_mean + perm2.importances_mean) / 2.0

# Column names of the preprocessed matrix, in ColumnTransformer output order.
feat_names = []
for name, trans, cols in best_pipe.named_steps["prep"].transformers_:
    if name == "num":
        feat_names.extend(cols)
    elif name == "cat":
        ohe = trans.named_steps.get("onehot", None)
        if ohe is not None:
            try:
                fn = ohe.get_feature_names_out(cols)
            except Exception:  # fall back to the encoder's generic names
                fn = ohe.get_feature_names_out()
            feat_names.extend(fn.tolist())
    elif name.startswith("text_"):
        # Encode one dummy string to discover the embedding width.
        dummy = trans.transform(pd.DataFrame({cols[0]: ["sample"]}))
        feat_names.extend([f"{name}_emb_{i}" for i in range(dummy.shape[1])])
# If the reconstruction does not line up with the matrix, use positional names.
if len(feat_names) != Xt_valid.shape[1]:
    feat_names = [f"f_{i}" for i in range(Xt_valid.shape[1])]

rank = np.argsort(imp_mean)[::-1]
top_k = 20
top_idx = rank[:top_k]
importances = pd.DataFrame({"feature": [feat_names[i] for i in top_idx],
                            "importance": [float(imp_mean[i]) for i in top_idx]})
importances.to_csv(os.path.join(OUTDIR,"top20_features.csv"), index=False)
print(importances.head())

def select_columns_by_idx(Xm, idx=None):
    """Return the columns of ``Xm`` listed in ``idx``.

    When ``idx`` is omitted the module-level ``top_idx`` is used, which keeps
    the original single-argument call working.
    """
    columns = top_idx if idx is None else idx
    return Xm[:, columns]

# Bind the indices through ``kw_args`` so the fitted pipeline carries them
# itself instead of depending on a ``top_idx`` global at prediction time.
# NOTE: the function is still pickled by reference, so it must be defined in
# the session that loads the saved pipeline.
feature_selector = FunctionTransformer(select_columns_by_idx, validate=False,
                                       kw_args={"idx": top_idx.tolist()})

pipe_top20 = Pipeline([("prep", preprocess), ("select", feature_selector), ("reg", final_est)])
pipe_top20.fit(X_train, y_train_log)

def test_report(pipe, tag):
    """Score ``pipe`` on the held-out test split in original (expm1) units."""
    yhat = np.expm1(pipe.predict(X_test))
    ytrue = np.expm1(y_test_log)
    mf = metrics_frame(ytrue, yhat, ["damage_property","damage_crops"])
    # Real newlines: the escaped "\\n" printed a literal backslash-n.
    print(f"\n=== TEST REPORT: {tag} ===\n", mf)
    return mf

mf_top20_best = test_report(pipe_top20, f"Top-20 winner = {winner}")
with open(os.path.join(OUTDIR, "test_metrics.json"), "w", encoding="utf-8") as fh:
    json.dump({"top20_best": mf_top20_best.to_dict(orient="index")}, fh, indent=2)
joblib.dump({"pipeline": pipe_top20, "feature_space":"top20", "top20_indices": top_idx.tolist(), "text_model": "all-MiniLM-L6-v2", "targets":["Y_PROP","Y_CROP"]}, os.path.join(OUTDIR,"damage_pipeline_best.joblib"))
print("✓ Saved final model to damage_pipeline_best.joblib")


In [None]:

# === 10) SHAP Explainability ===
bundle = joblib.load(os.path.join(OUTDIR, "damage_pipeline_best.joblib"))
best_pipe = bundle["pipeline"]

X_sample = X_valid.sample(min(200, len(X_valid)), random_state=42)
prep_step = best_pipe.named_steps["prep"]
X_prep = prep_step.transform(X_sample)
final_est = best_pipe.named_steps["reg"]

explainer = None
shap_values = None

# Names of the full preprocessed matrix, in ColumnTransformer output order.
feat_names = []
for name, trans, cols in prep_step.transformers_:
    if name == "num":
        feat_names.extend(cols)
    elif name == "cat":
        ohe = trans.named_steps.get("onehot", None)
        if ohe is not None:
            try:
                fn = ohe.get_feature_names_out(cols)
            except Exception:  # fall back to the encoder's generic names
                fn = ohe.get_feature_names_out()
            feat_names.extend(fn.tolist())
    elif name.startswith("text_"):
        dummy = trans.transform(pd.DataFrame({cols[0]: ["sample"]}))
        feat_names.extend([f"{name}_emb_{i}" for i in range(dummy.shape[1])])
if len(feat_names) != X_prep.shape[1]:
    feat_names = [f"f_{i}" for i in range(X_prep.shape[1])]

# The saved pipeline is prep -> select -> reg: the regressor was trained on
# the selected columns only, so SHAP must see the same reduced matrix and
# names. Feeding it the full matrix made every explainer call fail.
select_step = best_pipe.named_steps.get("select")
if select_step is not None:
    X_prep = select_step.transform(X_prep)
    kept = bundle.get("top20_indices")
    if kept is not None and len(kept) == X_prep.shape[1]:
        feat_names = [feat_names[i] for i in kept]
    else:
        feat_names = [f"f_{i}" for i in range(X_prep.shape[1])]

try:
    if isinstance(final_est, MultiOutputRegressor):
        expl0 = shap.Explainer(final_est.estimators_[0], X_prep); sv0 = expl0(X_prep)
        expl1 = shap.Explainer(final_est.estimators_[1], X_prep); sv1 = expl1(X_prep)
        shap_values = np.stack([sv0.values, sv1.values], axis=2)  # [n,f,2]
    else:
        # NOTE(review): a custom estimator is not a model type SHAP
        # recognises, so its predict function is explained instead.
        expl = shap.Explainer(final_est.predict, X_prep)
        shap_values = expl(X_prep)
except Exception as e:
    print("⚠️ SHAP failed:", e)

if shap_values is not None:
    if isinstance(shap_values, np.ndarray):
        vals = shap_values
    else:
        vals = np.asarray(shap_values.values) if hasattr(shap_values, "values") else None

    mean_abs = None
    if vals is not None:
        # Collapse samples (and targets, when present) into one score per feature.
        mean_abs = np.mean(np.abs(vals), axis=(0, 2)) if vals.ndim == 3 else np.mean(np.abs(vals), axis=0)

    if mean_abs is not None:
        shap_df = pd.DataFrame({"feature": feat_names, "mean_abs_shap": mean_abs}).sort_values("mean_abs_shap", ascending=False)
        shap_df.to_csv(os.path.join(OUTDIR, "shap_importance.csv"), index=False)

        plt.figure(figsize=(8,5))
        idx = np.argsort(mean_abs)[-20:][::-1]
        plt.barh([feat_names[i] for i in idx][::-1], mean_abs[idx][::-1])
        plt.xlabel("Mean |SHAP|"); plt.title("Top 20 Global Features (SHAP)")
        plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"shap_barplot_global.png"), dpi=300); plt.close()

        if isinstance(shap_values, np.ndarray) and shap_values.ndim==3:
            for ti, tname in enumerate(["damage_property","damage_crops"]):
                try:
                    plt.figure(figsize=(8,5))
                    shap.summary_plot(shap_values[...,ti], X_prep, feature_names=feat_names, show=False)
                    plt.title(f"SHAP Summary — {tname}")
                    plt.savefig(os.path.join(OUTDIR, f"shap_beeswarm_{tname}.png"), dpi=300, bbox_inches="tight")
                    plt.close()
                except Exception as e:
                    print("Beeswarm failed:", e)
        else:
            try:
                plt.figure(figsize=(8,5))
                # The plot needs a 2-D matrix; average over targets if present.
                plot_vals = vals if vals.ndim == 2 else vals.mean(axis=2)
                shap.summary_plot(plot_vals, X_prep, feature_names=feat_names, show=False)
                plt.title("SHAP Summary")
                plt.savefig(os.path.join(OUTDIR, "shap_beeswarm_generic.png"), dpi=300, bbox_inches="tight")
                plt.close()
            except Exception as e:
                print("Beeswarm generic failed:", e)

print("SHAP artifacts done.")


In [None]:

# === 11) LLM explanations + combined CSV ===
from IPython.display import display, Markdown

def llm_explain_shap_for_sample(top_pairs, model_name="gpt-4-turbo"):
    """Describe one sample's top SHAP drivers in natural language.

    Args:
        top_pairs: Iterable of ``(feature_name, shap_value)`` pairs.
        model_name: Chat model used when an OpenAI key is configured.

    Returns:
        The model's explanation, a templated sentence when ``OPENAI_READY`` is
        false, or an ``"LLM unavailable: ..."`` message if the API call fails.
    """
    if not OPENAI_READY:
        parts = [f"{f} ({v:+.3f})" for f, v in top_pairs]
        return ("Model predicts higher/lower damages primarily due to: " +
                ", ".join(parts) +
                ". Text embeddings reflect severity/impact cues; geospatial and duration features modulate exposure.")
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        # Built from ordinary string literals; the escaped f\"\"\" form that
        # was here before is a syntax error.
        prompt = (
            "Explain, in 3–5 sentences, why the following features and SHAP values\n"
            "increase/decrease predicted storm damages (property/crops). Be concise.\n"
            f"Features: {top_pairs}"
        )
        resp = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role":"system","content":"You are a precise data scientist writing model explanations."},
                {"role":"user","content":prompt}
            ],
            temperature=0.2
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: an explanation failure must not abort the report.
        return f"LLM unavailable: {e}"

# One row per (sample, top feature); the sample's explanation text is repeated
# on each of its rows.
combined_rows = []
try:
    if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
        mean_abs_sample = np.mean(np.abs(shap_values), axis=2)  # [n, f]
        for i in range(min(10, mean_abs_sample.shape[0])):
            order = np.argsort(-mean_abs_sample[i])[:5]
            pairs_mean = [(feat_names[j], float(np.mean(shap_values[i, j, :]))) for j in order]
            llm_text = llm_explain_shap_for_sample(pairs_mean)
            for j in order:
                combined_rows.append({"sample_id": i, "feature": feat_names[j], "shap_value": float(np.mean(shap_values[i, j, :])), "llm_summary": llm_text})
    else:
        sv = shap_values.values if hasattr(shap_values, "values") else None
        if sv is not None:
            for i in range(min(10, sv.shape[0])):
                order = np.argsort(-np.abs(sv[i]))[:5]
                pairs = [(feat_names[j], float(sv[i, j])) for j in order]
                llm_text = llm_explain_shap_for_sample(pairs)
                for j in order:
                    combined_rows.append({"sample_id": i, "feature": feat_names[j], "shap_value": float(sv[i, j]), "llm_summary": llm_text})
except Exception as e:
    print("⚠️ Unable to build combined SHAP+LLM rows:", e)

# Explicit columns keep the schema stable (and the loop below safe) even when
# no rows were produced, e.g. because SHAP failed earlier.
combined_df = pd.DataFrame(combined_rows, columns=["sample_id", "feature", "shap_value", "llm_summary"])
combined_path = os.path.join(OUTDIR, "combined_shap_llm_report.csv")
combined_df.to_csv(combined_path, index=False)
print("✓ Saved combined SHAP + LLM report ->", combined_path)

for sid in sorted(combined_df["sample_id"].unique())[:3]:
    sub = combined_df[combined_df["sample_id"] == sid]
    # Real newlines: the escaped "\\n" rendered as literal text in Markdown.
    md = f"### 🌩️ Event {sid} — Top Feature Drivers\n"
    for _, row in sub.iterrows():
        sign = "⬆️" if row["shap_value"] > 0 else "⬇️"
        md += f"- **{row['feature']}** ({sign}{abs(row['shap_value']):.4f})\n"
    md += f"\n**LLM Explanation:** {sub.iloc[0]['llm_summary']}\n"
    display(Markdown(md))


In [None]:

# === 12) Executive Summary (LLM or fallback) ===
from datetime import datetime
from textwrap import shorten

metrics_path = os.path.join(OUTDIR, "test_metrics.json")
shap_path    = os.path.join(OUTDIR, "shap_importance.csv")
local_path   = os.path.join(OUTDIR, "combined_shap_llm_report.csv")
summary_txt  = os.path.join(OUTDIR, "executive_summary.txt")
snapshot_json= os.path.join(OUTDIR, "data_snapshot.json")

def _load_json_or(path, default):
    """Return the parsed JSON at ``path``, or ``default`` if the file is absent."""
    if not os.path.exists(path):
        return default
    # Context manager so the handle is closed after reading.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)

# Each artifact is optional: fall back to an empty placeholder when missing.
metrics = _load_json_or(metrics_path, None)
shap_top = pd.read_csv(shap_path).head(10) if os.path.exists(shap_path) else pd.DataFrame(columns=["feature","mean_abs_shap"])
local_df = pd.read_csv(local_path) if os.path.exists(local_path) else pd.DataFrame(columns=["sample_id","feature","shap_value","llm_summary"])
snapshot = _load_json_or(snapshot_json, {"rows":None,"cols":None})

def build_prompt(perf_lines, feat_lines, examples, snapshot):
    """Assemble the executive-summary prompt sent to the LLM.

    Args:
        perf_lines: Pre-formatted test-metric lines (may be empty).
        feat_lines: Pre-formatted global-feature lines (may be empty).
        examples: Pre-formatted per-event example lines (may be empty).
        snapshot: Dict with ``rows``, ``cols``, ``missing_Y_PROP`` and
            ``missing_Y_CROP``; absent keys render as ``None``.

    Returns:
        The prompt text. Empty sections are replaced by a placeholder line.
    """
    # "\n" rather than os.linesep, so line endings are the same on every OS.
    perf_block = "\n".join(perf_lines) if perf_lines else "- (no metrics found)"
    feat_block = "\n".join(feat_lines) if feat_lines else "- (no SHAP found)"
    example_block = "\n".join(examples) if examples else "- (no examples)"
    lines = [
        "You are a senior data scientist. Write an executive summary (150–220 words) about a storm-damage model",
        "(two targets: damage_property, damage_crops) using structured and text features embedded by a pretrained SentenceTransformer.",
        "",
        f"Data snapshot: rows={snapshot.get('rows')}, cols={snapshot.get('cols')}, "
        f"missing_Y_PROP={snapshot.get('missing_Y_PROP')}, missing_Y_CROP={snapshot.get('missing_Y_CROP')}.",
        "",
        "Performance (test):",
        perf_block,
        "",
        "Top global features (mean |SHAP|):",
        feat_block,
        "",
        "Examples:",
        example_block,
        "",
        "Include: What was modeled and why, performance meaning, key drivers (incl. narratives), limitations, and next steps.",
        "Return 1–2 cohesive paragraphs (no bullets).",
    ]
    return "\n".join(lines)

def heuristic_summary(perf_lines, feat_lines, snapshot):
    """Build the templated executive summary used when no LLM is available.

    Args:
        perf_lines: Pre-formatted test-metric lines (may be empty).
        feat_lines: Lines shaped like ``"- <feature>: <score>"``; the feature
            names are extracted from them.
        snapshot: Dict providing ``rows`` and ``cols``.

    Returns:
        Two paragraphs of plain text separated by a blank line.
    """
    perf = "\n".join(perf_lines) if perf_lines else "No test-set metrics available."

    # Derive the names from the argument instead of a module-level DataFrame,
    # so the function depends only on its inputs.
    names = []
    for line in feat_lines or []:
        body = line[2:] if line.startswith("- ") else line
        names.append(body.rsplit(":", 1)[0].strip())
    feats = ", ".join(names) if names else "no dominant features identified"

    return (f"This report describes a two-target regression system predicting property and crop damages from storm events, "
            f"combining structured variables with narrative embeddings from a pretrained SentenceTransformer. The dataset "
            f"contained roughly {snapshot.get('rows')} rows and {snapshot.get('cols')} columns. Test-set performance was:\n{perf}\n\n"
            f"Global explainability indicates the model relies most on {feats}. Local analyses show that narrative cues about "
            f"structural destruction, flooding, and prolonged duration align with higher predicted losses, while location and "
            f"event-type effects capture exposure and hazard intensity. Limitations include potential noise or sparsity in text "
            f"fields, coarse geospatial resolution, and limited representation of small-to-medium events. Recommended next steps "
            f"include incorporating exposure/asset data (e.g., building density), richer geospatial covariates, and additional "
            f"training data to improve stability and calibration.")

# One bullet per target with the saved test metrics (if any were found).
perf_lines = []
if metrics and "top20_best" in metrics:
    mdf = pd.DataFrame(metrics["top20_best"]).T[["MAE","RMSE","R2"]]
    for target, row in mdf.iterrows():
        perf_lines.append(f"- {target}: MAE={row['MAE']:.0f}, RMSE={row['RMSE']:.0f}, R²={row['R2']:.3f}")
# One bullet per row of the global SHAP table loaded above.
feat_lines = [f"- {r.feature}: {r.mean_abs_shap:.4f}" for _, r in shap_top.iterrows()]

# Up to three events, each with its three largest-magnitude SHAP drivers and
# a shortened copy of the stored explanation.
examples = []
if not local_df.empty:
    for sid in sorted(local_df["sample_id"].unique())[:3]:
        chunk = local_df[local_df["sample_id"] == sid].sort_values("shap_value", key=np.abs, ascending=False).head(3)
        tops = "; ".join([f"{r.feature} ({r.shap_value:+.3f})" for _, r in chunk.iterrows()])
        txt  = shorten(str(chunk.iloc[0]["llm_summary"]), width=300, placeholder="…")
        examples.append(f"• Event {sid}: {tops} — {txt}")

# Use the LLM when a key is configured; any failure falls back to the
# templated summary so this cell always produces text.
if OPENAI_READY:
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        prompt = build_prompt(perf_lines, feat_lines, examples, snapshot)
        resp = client.chat.completions.create(model="gpt-4-turbo",
            messages=[{"role":"system","content":"You are a careful, precise data scientist writing executive summaries."},
                      {"role":"user","content":prompt}], temperature=0.2)
        summary_text = resp.choices[0].message.content.strip()
    except Exception as e:
        print("LLM failed, fallback:", e); summary_text = heuristic_summary(perf_lines, feat_lines, snapshot)
else:
    summary_text = heuristic_summary(perf_lines, feat_lines, snapshot)

with open(summary_txt, "w", encoding="utf-8") as f: f.write(summary_text)
print("✓ Executive summary saved to:", summary_txt)


In [None]:

# === 13) PDF Generation ===
import datetime
from xml.sax.saxutils import escape

pdf_path = os.path.join(OUTDIR, "storm_damage_explainability_report.pdf")
styles = getSampleStyleSheet()
doc = SimpleDocTemplate(pdf_path, pagesize=A4)
story = []  # flowables in page order; rendered by doc.build(story) at the end of this cell

# If an executive summary file exists, it opens the report.
exec_path = os.path.join(OUTDIR, "executive_summary.txt")
if os.path.exists(exec_path):
    # Read through a context manager so the file handle is always closed.
    with open(exec_path, "r", encoding="utf-8") as fh:
        exec_text = fh.read().strip()

    story.append(Paragraph("<b>Executive Summary</b>", styles["Heading1"]))
    story.append(Spacer(1, 10))
    # Paragraph breaks are blank lines. Accept real newlines (e.g. LLM output)
    # as well as literal backslash-n sequences, which the previous
    # split("\\n\\n") was limited to.
    for para in re.split(r"(?:\r?\n|\\n){2,}", exec_text):
        para = para.strip()
        if not para:
            continue
        # The summary is plain text, but Paragraph parses its input as markup;
        # escape &, < and > so they are rendered literally.
        story.append(Paragraph(escape(para), styles["BodyText"]))
        story.append(Spacer(1, 10))
    story.append(Spacer(1, 18))

# Report title and generation timestamp (local time, naive).
story.append(Paragraph("<b>Storm Damage Prediction – Explainability Report</b>", styles["Title"]))
story.append(Spacer(1, 12))
story.append(Paragraph(f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles["Normal"]))
story.append(Spacer(1, 18))

# Model-performance table built from test_metrics.json (section skipped if the
# file or its "top20_best" entry is absent).
test_metrics_path = os.path.join(OUTDIR, "test_metrics.json")
if os.path.exists(test_metrics_path):
    # Context manager closes the handle that json.load(open(...)) used to leak.
    with open(test_metrics_path, "r", encoding="utf-8") as fh:
        metrics = json.load(fh)

    # Same guard as the executive-summary step: only use "top20_best" when it
    # is actually present, instead of raising KeyError mid-report.
    top20_metrics = metrics.get("top20_best") if isinstance(metrics, dict) else None
    if top20_metrics:
        story.append(Paragraph("<b>Model Performance on Test Set</b>", styles["Heading2"]))
        # Transpose so each top-level key's entries become a row; the former
        # index is exposed as the "Target" column.
        metrics_df = pd.DataFrame(top20_metrics).T.reset_index().rename(columns={"index":"Target"})
        table_data = [list(metrics_df.columns)] + metrics_df.values.tolist()
        table = Table(table_data, hAlign="LEFT")
        table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#003366")),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.whitesmoke),
            ("ALIGN", (0, 0), (-1, -1), "CENTER"),
            ("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
            ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        ]))
        story.append(table)
        story.append(Spacer(1, 18))

# Global SHAP section: importance table, optional bar chart, optional beeswarm plots.
shap_csv = os.path.join(OUTDIR, "shap_importance.csv")
combined_path = os.path.join(OUTDIR, "combined_shap_llm_report.csv")

# Both CSVs are required inputs. Raise explicitly (an assert is stripped under
# `python -O`) and say which file is missing.
missing_inputs = [path for path in (shap_csv, combined_path) if not os.path.exists(path)]
if missing_inputs:
    raise FileNotFoundError("Run SHAP + LLM blocks first. Missing: " + ", ".join(missing_inputs))

shap_df = pd.read_csv(shap_csv).head(20)  # at most the first 20 rows of the CSV
bar_plot_path = os.path.join(OUTDIR, "shap_barplot_global.png")

story.append(Paragraph("<b>Global Feature Importance (SHAP)</b>", styles["Heading2"]))
tdata = [list(shap_df.columns)] + shap_df.values.tolist()
table = Table(tdata, hAlign="LEFT")
table.setStyle(TableStyle([
    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#003366")),
    ("TEXTCOLOR", (0, 0), (-1, 0), colors.whitesmoke),
    ("ALIGN", (0, 0), (-1, -1), "CENTER"),
    ("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
]))
story.append(table)
story.append(Spacer(1, 12))

# Plots are optional: each image is included only if its file exists.
if os.path.exists(bar_plot_path):
    story.append(Paragraph("<b>Global SHAP Bar Chart</b>", styles["Heading3"]))
    story.append(Image(bar_plot_path, width=400, height=250))
    story.append(Spacer(1, 12))

# (file name, title label) for each beeswarm plot, in report order.
beeswarm_plots = [
    ("shap_beeswarm_damage_property.png", "Damage Property"),
    ("shap_beeswarm_damage_crops.png", "Damage Crops"),
    ("shap_beeswarm_generic.png", "Overall"),
]
for plot_file, label in beeswarm_plots:
    p = os.path.join(OUTDIR, plot_file)
    if os.path.exists(p):
        title = f"Beeswarm Plot — {label}"
        story.append(Paragraph(f"<b>{title}</b>", styles["Heading3"]))
        story.append(Image(p, width=400, height=250))
        story.append(Spacer(1, 12))

# Sample-level section: for every sample_id in the combined SHAP + LLM CSV,
# add a feature / SHAP-value table followed by that event's LLM explanation,
# then render the PDF.
from xml.sax.saxutils import escape

combined_df = pd.read_csv(combined_path)
story.append(Paragraph("<b>Sample-Level Interpretations</b>", styles["Heading2"]))
for sid, group in combined_df.groupby("sample_id"):
    story.append(Paragraph(f"<b>Event {escape(str(sid))} – Local Explanation</b>", styles["Heading3"]))
    top_feats = group[["feature","shap_value"]].values.tolist()
    tdata = [["Feature","SHAP Value"]] + top_feats
    table = Table(tdata, hAlign="LEFT", colWidths=[220, 100])
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
        ("GRID", (0, 0), (-1, -1), 0.25, colors.grey),
        ("ALIGN", (1, 1), (-1, -1), "CENTER"),
    ]))
    story.append(table)
    story.append(Spacer(1, 6))

    # The explanation is taken from the group's first row. It is free text, so
    # escape it before embedding in <i>…</i> markup; a missing value (NaN after
    # the CSV round-trip) is skipped rather than printed as "nan".
    llm_note = group.iloc[0]["llm_summary"]
    if pd.notna(llm_note):
        story.append(Paragraph(f"<i>{escape(str(llm_note))}</i>", styles["BodyText"]))
    story.append(Spacer(1, 18))

# Render all accumulated flowables to pdf_path.
doc.build(story)
print(f"✅ Full dual-target explainability report saved at: {pdf_path}")
