
# 🌪️ Storm Damage Prediction — v4.2 (GPU‑Accelerated)

End‑to‑end pipeline to predict **Property** and **Crop** damages from NOAA Storm Events using:
- Text + tabular features (SentenceTransformer embeddings + engineered features)
- Fast **GPU XGBoost** training & prediction
- **SHAP** (GPU TreeSHAP) for feature importance
- Quantile intervals via residual bootstrapping

**Outputs:** written to `./results`.


In [1]:

# === 1) Setup & Config ===
import os, math, json, gc, random, time
from pathlib import Path
import numpy as np
import pandas as pd

# Reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Runtime mode: '2h', '5h', 'full'
MODE = os.environ.get("PIPELINE_MODE", "2h")

# Paths
OUTDIR = Path("./results")
OUTDIR.mkdir(parents=True, exist_ok=True)

# GPU / CUDA
import torch
HAS_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if HAS_CUDA else "cpu"
print(f"MODE={MODE} | HAS_CUDA={HAS_CUDA} | DEVICE={DEVICE}")

# SentenceTransformer model (384‑dim, fast & light)
EMBED_MODEL_NAME = os.environ.get("EMBED_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")

# Simple timer
from contextlib import contextmanager
@contextmanager
def timer(msg: str):
    t0 = time.time()
    print(f"⏱️ {msg} ...", flush=True)
    try:
        yield
    finally:
        dt = time.time() - t0
        print(f"⏱️ {msg}: {dt:.2f}s", flush=True)


MODE=2h | HAS_CUDA=True | DEVICE=cuda


In [2]:

# === 2) Load & Clean ===
# Set your CSV path here:
CSV_PATH = os.environ.get("CSV_PATH", "StormEvents_details-ftp_v1.0_d2013_c20250520.csv")

def parse_damage(v):
    if pd.isna(v): return np.nan
    s = str(v).strip().upper()
    if not s: return np.nan
    mult = 1
    if s.endswith('K'): mult, s = 1_000, s[:-1]
    elif s.endswith('M'): mult, s = 1_000_000, s[:-1]
    elif s.endswith('B'): mult, s = 1_000_000_000, s[:-1]
    try:
        return float(s) * mult
    except:
        try:
            return float(s.replace(',',''))
        except:
            return np.nan

with timer("Load CSV"):
    df = pd.read_csv(CSV_PATH, low_memory=False, encoding='utf-8')

df.columns = [c.strip().upper() for c in df.columns]

# Basic drop of IDs we won't use
for dropc in ['EPISODE_ID','EVENT_ID','DATA_SOURCE']:
    if dropc in df.columns:
        df.drop(columns=dropc, inplace=True)

assert 'DAMAGE_PROPERTY' in df.columns and 'DAMAGE_CROPS' in df.columns, "CSV missing DAMAGE_PROPERTY / DAMAGE_CROPS"

# Targets
df['Y_PROP'] = df['DAMAGE_PROPERTY'].apply(parse_damage)
df['Y_CROP'] = df['DAMAGE_CROPS'].apply(parse_damage)
df = df[(df['Y_PROP'].notna()) | (df['Y_CROP'].notna())].copy()

# Dates
def to_dt(series):
    return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
for c in ['BEGIN_DATE_TIME','END_DATE_TIME']:
    if c in df.columns: df[c] = to_dt(df[c])

df['DURATION_HOURS'] = (df['END_DATE_TIME'] - df['BEGIN_DATE_TIME']).dt.total_seconds()/3600
df['DURATION_HOURS'] = df['DURATION_HOURS'].clip(lower=0).fillna(0)

# Geo to numeric
for c in ['BEGIN_LAT','BEGIN_LON','END_LAT','END_LON']:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

df['LAT_MEAN'] = df[['BEGIN_LAT','END_LAT']].mean(axis=1)
df['LON_MEAN'] = df[['BEGIN_LON','END_LON']].mean(axis=1)

# Trig features
df['LAT_SIN'] = np.sin(np.deg2rad(df['LAT_MEAN']))
df['LAT_COS'] = np.cos(np.deg2rad(df['LAT_MEAN']))
df['LON_SIN'] = np.sin(np.deg2rad(df['LON_MEAN']))
df['LON_COS'] = np.cos(np.deg2rad(df['LON_MEAN']))

# Drop some detailed geo/location if present
for c in ['BEGIN_LAT','BEGIN_LON','END_LAT','END_LON','BEGIN_LOCATION','END_LOCATION','BEGIN_AZIMUTH','END_AZIMUTH','BEGIN_RANGE','END_RANGE']:
    if c in df.columns:
        df.drop(columns=c, inplace=True)

print("Cleaned shape:", df.shape)


⏱️ Load CSV ...
⏱️ Load CSV: 0.90s


  return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
  return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
  return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
  return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)


Cleaned shape: (52259, 47)


In [3]:

# === 3) Feature Lists & Split ===
from sklearn.model_selection import train_test_split

text_cols = [c for c in ['EPISODE_NARRATIVE','EVENT_NARRATIVE'] if c in df.columns]
cat_cols  = [c for c in ['STATE','EVENT_TYPE','CZ_TYPE','CZ_NAME','LOCATION_NAME'] if c in df.columns]
num_cols  = [c for c in [
    'INJURIES_DIRECT','INJURIES_INDIRECT','DEATHS_DIRECT','DEATHS_INDIRECT','DURATION_HOURS',
    'LAT_MEAN','LON_MEAN','LAT_SIN','LAT_COS','LON_SIN','LON_COS'
] if c in df.columns]

X_cols = num_cols + cat_cols + text_cols
df_model = df[X_cols + ['Y_PROP','Y_CROP']].copy()

y = np.column_stack([df_model['Y_PROP'].fillna(0).values, df_model['Y_CROP'].fillna(0).values])
y_log = np.log1p(y)

X = df_model[X_cols].copy()
X_train, X_temp, y_train_log, y_temp_log = train_test_split(X, y_log, test_size=0.30, random_state=42)
X_valid, X_test, y_valid_log, y_test_log = train_test_split(X_temp, y_temp_log, test_size=0.50, random_state=42)

print('Split sizes:', len(X_train), len(X_valid), len(X_test))


Split sizes: 36581 7839 7839


In [None]:
# === 4) Preprocessing (GPU-safe embeddings + memory-safe OHE) ===
import os
os.environ.setdefault('TRANSFORMERS_NO_TORCHVISION','1')
from sentence_transformers import SentenceTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

embed_model = SentenceTransformer(
    EMBED_MODEL_NAME,
    device=("cuda" if torch.cuda.is_available() else "cpu")
)

try:
    embed_model.max_seq_length = 256
except Exception:
    pass

def embed_text_batched(X, batch_size=512):
    if isinstance(X, pd.DataFrame):
        seq = X.iloc[:, 0].astype(str).fillna('').tolist()
    elif isinstance(X, pd.Series):
        seq = X.astype(str).fillna('').tolist()
    else:
        seq = [str(t) for t in X]

    outputs = []
    i = 0
    bs = batch_size

    while i < len(seq):
        j = min(i + bs, len(seq))
        chunk = seq[i:j]
        try:
            with torch.inference_mode():
                embs = embed_model.encode(
                    chunk,
                    batch_size=bs,
                    convert_to_numpy=True,
                    show_progress_bar=False,
                    normalize_embeddings=False
                )
            outputs.append(embs.astype('float32', copy=False))
            i = j

        except RuntimeError as e:
            if 'CUDA out of memory' in str(e) and bs > 8 and torch.cuda.is_available():
                torch.cuda.empty_cache()
                bs = max(8, bs // 2)
                print(f"[embed_text] OOM → reducing batch_size to {bs}")
            else:
                raise

    if not outputs:
        dim = embed_model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    return np.vstack(outputs)

# Numeric pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ✅ Categorical pipeline (memory-safe OHE)
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False,     # ✅ fixed argument
        dtype=np.float32,
        max_categories=200       # ✅ memory-safe cap
    ))
])

# Build column transformer
transformers = []
if num_cols:
    transformers.append(('num', num_pipe, num_cols))
if cat_cols:
    transformers.append(('cat', cat_pipe, cat_cols))
for c in text_cols:
    transformers.append((f'text_{c}', FunctionTransformer(embed_text_batched, validate=False), [c]))

preprocess = ColumnTransformer(
    transformers=transformers,
    remainder='drop',
    n_jobs=1
)

print(f"✅ Preprocess ready. Text model: {EMBED_MODEL_NAME} on {('cuda' if torch.cuda.is_available() else 'cpu')}")


✅ Preprocess ready. Text model: sentence-transformers/all-MiniLM-L6-v2 on cuda


In [6]:

# === 5) Sanity Check ===
print("Numeric:", num_cols[:8], "..." if len(num_cols)>8 else "")
print("Categorical:", cat_cols[:8], "..." if len(cat_cols)>8 else "")
print("Text:", text_cols)


Numeric: ['INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT', 'DURATION_HOURS', 'LAT_MEAN', 'LON_MEAN', 'LAT_SIN'] ...
Categorical: ['STATE', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_NAME'] 
Text: ['EPISODE_NARRATIVE', 'EVENT_NARRATIVE']


In [7]:

# === 6) First-Pass Model Evaluation (GPU-accelerated) ===
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge
import xgboost as xgb

print("🔧 Step 6 — Optimized GPU-Accelerated Evaluation")
OUTDIR.mkdir(parents=True, exist_ok=True)

with timer("Precompute preprocess on train/valid (GPU embeddings)"):
    Xt_train = preprocess.fit_transform(X_train, y_train_log).astype("float32")
    Xt_valid = preprocess.transform(X_valid).astype("float32")

feature_names = []
for c in num_cols: feature_names.append(f"num__{c}")
for c in cat_cols: feature_names.append(f"cat__{c}_oh")
embed_dim = 384  # all-MiniLM-L6-v2
try:
    embed_dim = embed_model.get_sentence_embedding_dimension()
except Exception:
    pass
for c in text_cols:
    feature_names.extend([f"text__{c}_emb_{i}" for i in range(embed_dim)])
np.save("feature_names.npy", np.array(feature_names, dtype=object))
print("✅ Saved feature_names.npy")

def metrics_frame(y_true, y_pred, labels):
    out={}
    for i,name in enumerate(labels):
        out[name]=dict(
            MAE=float(mean_absolute_error(y_true[:,i], y_pred[:,i])),
            RMSE=float(math.sqrt(mean_squared_error(y_true[:,i], y_pred[:,i]))),
            R2=float(r2_score(y_true[:,i], y_pred[:,i])),
        )
    return pd.DataFrame(out).T

if MODE == "2h":
    MAX_ESTIMATORS, MAX_DEPTH, LEARNING_RATE = 350, 6, 0.07
elif MODE == "5h":
    MAX_ESTIMATORS, MAX_DEPTH, LEARNING_RATE = 700, 7, 0.05
else:
    MAX_ESTIMATORS, MAX_DEPTH, LEARNING_RATE = 1200, 8, 0.045

def make_ridge(alpha=3.0):
    return Ridge(alpha=alpha, fit_intercept=True, random_state=SEED)

def make_xgb(n_estimators=MAX_ESTIMATORS, max_depth=MAX_DEPTH, learning_rate=LEARNING_RATE, subsample=0.9, colsample_bytree=0.9):
    params=dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=SEED,
        n_jobs=-1
    )
    if torch.cuda.is_available():
        params.update(dict(tree_method="gpu_hist", predictor="gpu_predictor", device="cuda"))
        print("⚡ XGB: Using GPU")
    else:
        params.update(dict(tree_method="hist", predictor="cpu_predictor", device="cpu"))
        print("⚠️ XGB: Using CPU")
    return xgb.XGBRegressor(**params)

def eval_on_split(model, Xtr, ytr_log, Xev, yev_log, label="eval"):
    with timer(f"fit {label}"):
        model.fit(Xtr, ytr_log)
    yhat_log = model.predict(Xev)
    yhat = np.expm1(yhat_log)
    ytrue = np.expm1(yev_log)
    mf = metrics_frame(ytrue, yhat, ["damage_property", "damage_crops"])
    print(f"\n📊 {label} results:\n{mf}\n")
    return mf, yhat

val_results = {}
print("🚀 Running first‑pass…")
for name, model in {"ridge": make_ridge(), "xgb": make_xgb()}.items():
    mf, _ = eval_on_split(model, Xt_train, y_train_log, Xt_valid, y_valid_log, f"{name} (validation)")
    val_results[name] = mf

path = OUTDIR / "validation_metrics_first_pass.json"
with open(path, "w") as f:
    json.dump({k: v.to_dict(orient="index") for k, v in val_results.items()}, f, indent=2)
print("✅ Saved first-pass metrics to", path)


🔧 Step 6 — Optimized GPU-Accelerated Evaluation
⏱️ Precompute preprocess on train/valid (GPU embeddings) ...
⏱️ Precompute preprocess on train/valid (GPU embeddings): 368.72s
✅ Saved feature_names.npy
🚀 Running first‑pass…
⚡ XGB: Using GPU
⏱️ fit ridge (validation) ...
⏱️ fit ridge (validation): 0.34s

📊 ridge (validation) results:
                          MAE          RMSE           R2
damage_property  2.084303e+06  1.734214e+08 -4040.446716
damage_crops     1.098022e+05  2.624504e+06    -0.001658

⏱️ fit xgb (validation) ...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



⏱️ fit xgb (validation): 56.63s



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





📊 xgb (validation) results:
                           MAE          RMSE        R2
damage_property  123648.468662  2.724689e+06  0.002379
damage_crops      56287.413170  2.375631e+06  0.179303

✅ Saved first-pass metrics to results\validation_metrics_first_pass.json


In [None]:

# === 7) Final GPU XGBoost Model (Full Training) ===
import xgboost as xgb

print("🔧 Step 7 — Final GPU XGB Training")

def make_xgb_final():
    params = dict(
        n_estimators=1500 if MODE=='full' else (700 if MODE=='5h' else 500),
        max_depth=8 if MODE!='2h' else 6,
        learning_rate=0.045 if MODE!='2h' else 0.06,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=SEED,
        n_jobs=-1
    )
    if torch.cuda.is_available():
        params.update(dict(tree_method="gpu_hist", predictor="gpu_predictor", device="cuda"))
        print("⚡ Training final model on GPU")
    else:
        params.update(dict(tree_method="hist", predictor="cpu_predictor", device="cpu"))
        print("⚠️ GPU not available — training on CPU")
    return xgb.XGBRegressor(**params)

final_model = make_xgb_final()
with timer("Fit final XGB model (GPU-accelerated)"):
    final_model.fit(Xt_train, y_train_log)

print("✅ Final GPU XGB model trained")


🔧 Step 7 — Final GPU XGB Training
⚡ Training final model on GPU
⏱️ Fit final XGB model (GPU-accelerated) ...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



In [None]:

# === 8) SHAP Analysis (GPU TreeSHAP) ===
import shap

print("🔧 Step 8 — SHAP Global Interpretation")

feature_names = np.load("feature_names.npy", allow_pickle=True)

shap_sample_size = min(4000, Xt_train.shape[0])
shap_idx = np.random.choice(len(Xt_train), shap_sample_size, replace=False)
Xt_shap = Xt_train[shap_idx]

with timer("Compute SHAP values (GPU)"):
    explainer = shap.TreeExplainer(final_model, feature_perturbation="tree_path_dependent")
    shap_values = explainer.shap_values(Xt_shap)

np.save(OUTDIR / "shap_values.npy", shap_values)
np.save(OUTDIR / "shap_sample.npy", Xt_shap)
print("✅ SHAP values computed and saved")


In [None]:

# === 9) Final Predictions + Quantile Intervals (GPU) ===
print("🔧 Step 9 — Final Predictions & Uncertainty")

with timer("Predict on validation set (GPU)"):
    yhat_log_valid = final_model.predict(Xt_valid)
    yhat_valid = np.expm1(yhat_log_valid)
    ytrue_valid = np.expm1(y_valid_log)

resid = ytrue_valid - yhat_valid
QUANTILES = [0.1, 0.3, 0.5, 0.7, 0.9]
qvals = np.quantile(resid, QUANTILES, axis=0)

results = pd.DataFrame({
    "ytrue_property": ytrue_valid[:,0],
    "ypred_property": yhat_valid[:,0],
    "ytrue_crop": ytrue_valid[:,1],
    "ypred_crop": yhat_valid[:,1],
})

for q, val in zip(QUANTILES, qvals):
    results[f"q{int(q*100)}_prop"] = yhat_valid[:,0] + val[0]
    results[f"q{int(q*100)}_crop"] = yhat_valid[:,1] + val[1]

out_csv = OUTDIR / "test_quantile_predictions.csv"
results.to_csv(out_csv, index=False)
print("✅ Saved:", out_csv)


In [None]:

# === 10) Preview artifacts ===
from pprint import pprint
print("Artifacts in results/:")
pprint(sorted([p.name for p in OUTDIR.glob("*")]))
display(pd.read_csv(OUTDIR/"test_quantile_predictions.csv").head(5))
