# 🌪️ Storm Damage Pipeline — **FAST 5.6** (GPU‑aware, cached embeddings, slim tuning)

Designed to finish quickly on mid‑range GPUs (e.g., RTX 3050 Ti). Tactics: cached MiniLM embeddings, sparse one‑hot encoding, a slim Optuna search (6 trials in `fast` mode, 20 in `full` mode), early stopping, and a reduced embedding sequence length (128 tokens).

**Outputs:** `./results`

In [None]:

# === 0) Setup & Utilities ===
import os, sys, math, json, time, hashlib, warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Directory that receives every artifact written by this pipeline.
RESULTS = Path('./results')
RESULTS.mkdir(parents=True, exist_ok=True)

# Seed shared by the data splits, the Optuna sampler and the XGBoost models.
SEED = 42

# Run profile, read from the STORM_MODE environment variable: 'fast' | 'full'.
MODE = os.getenv('STORM_MODE', 'fast')

def timer(label):
    """Return a context manager that reports how long its body took.

    On entry it prints ``label`` followed by ``...``; on exit it prints
    ``label`` and the elapsed wall-clock seconds with two decimals. The exit
    message is printed whether or not the body raised, and exceptions are
    never suppressed. The object bound by ``with timer(...) as t`` is the
    manager itself; ``t.t0`` is the ``time.time()`` value taken on entry.
    """
    class _Stopwatch:
        def __enter__(self):
            self.t0 = time.time()
            print(f'⏱️ {label} ...')
            return self

        def __exit__(self, *exc_info):
            elapsed = time.time() - self.t0
            print(f'⏱️ {label}: {elapsed:.2f}s')

    return _Stopwatch()

def save_json(obj, path):
    """Write *obj* as 2-space-indented JSON (UTF-8) to *path*, creating parent directories as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as handle:
        json.dump(obj, handle, indent=2)

def save_joblib(obj, path):
    """Persist *obj* with ``joblib.dump`` at *path*, creating parent directories as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(obj, target)

def has_cuda():
    """Report whether PyTorch can see a CUDA device.

    Returns ``False`` when importing torch or querying CUDA raises any
    ``Exception`` (for example when torch is not installed).
    """
    try:
        import torch
        cuda_ok = torch.cuda.is_available()
    except Exception:
        cuda_ok = False
    return cuda_ok

# Evaluated once at startup and reused when configuring XGBoost.
HAS_CUDA = has_cuda()
print('GPU available:', HAS_CUDA)


In [None]:

# === 1) Load Data ===
# Location of the input CSV; override with the STORM_CSV environment variable.
CSV_PATH = os.getenv('STORM_CSV', './StormEvents_details-ftp_v1.0_d2013_c20250520.csv')
print('CSV_PATH =', CSV_PATH)

def parse_damage(v):
    """Convert a damage string such as ``'10.00K'`` or ``'1.5M'`` to a float.

    A trailing ``K``, ``M`` or ``B`` (case-insensitive) multiplies the number
    by 1e3, 1e6 or 1e9 respectively. Commas used as thousands separators are
    ignored.

    Args:
        v: Raw cell value; anything ``str()`` can render.

    Returns:
        The parsed amount as a float, or ``np.nan`` when *v* is missing,
        blank, or not numeric.
    """
    if pd.isna(v):
        return np.nan
    text = str(v).strip().upper()
    if not text:
        return np.nan

    suffixes = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    multiplier = suffixes.get(text[-1])
    if multiplier is None:
        multiplier = 1
    else:
        text = text[:-1]

    try:
        # Commas are stripped before parsing so the suffix multiplier is also
        # applied to comma-formatted numbers (e.g. '1,500K' -> 1_500_000).
        return float(text.replace(',', '')) * multiplier
    except ValueError:
        return np.nan

# Read the raw CSV and normalise headers to stripped upper-case names.
df = pd.read_csv(CSV_PATH, low_memory=False, encoding='utf-8')
df.columns = [c.strip().upper() for c in df.columns]

# Explicit raise instead of `assert`, which is stripped under `python -O`.
missing_targets = [c for c in ('DAMAGE_PROPERTY', 'DAMAGE_CROPS') if c not in df.columns]
if missing_targets:
    raise ValueError(f'Targets not found. Missing columns: {missing_targets}')

# Parse both targets and keep rows where at least one of them is a number.
df['Y_PROP'] = df['DAMAGE_PROPERTY'].apply(parse_damage)
df['Y_CROP'] = df['DAMAGE_CROPS'].apply(parse_damage)
df = df[df['Y_PROP'].notna() | df['Y_CROP'].notna()].copy()

# Timestamps: unparseable values become NaT. The deprecated
# `infer_datetime_format` flag is intentionally not passed.
date_cols = [c for c in ('BEGIN_DATE_TIME', 'END_DATE_TIME') if c in df.columns]
for c in date_cols:
    df[c] = pd.to_datetime(df[c], errors='coerce')

# Event duration in hours; needs both timestamps. Negative spans are clipped
# to 0 and rows with a missing timestamp get 0.
if len(date_cols) == 2:
    duration_hours = (df['END_DATE_TIME'] - df['BEGIN_DATE_TIME']).dt.total_seconds() / 3600
    df['DURATION_HOURS'] = duration_hours.clip(lower=0).fillna(0)

# Coordinates: coerce to numeric, then derive the begin/end mean and its
# sine/cosine for each axis from whichever coordinate columns exist.
for c in ['BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

coord_axes = [
    axis for axis in ('LAT', 'LON')
    if any(c in df.columns for c in (f'BEGIN_{axis}', f'END_{axis}'))
]
for axis in coord_axes:
    present = [c for c in (f'BEGIN_{axis}', f'END_{axis}') if c in df.columns]
    df[f'{axis}_MEAN'] = df[present].mean(axis=1)
for axis in coord_axes:
    radians = np.deg2rad(df[f'{axis}_MEAN'])
    df[f'{axis}_SIN'] = np.sin(radians)
    df[f'{axis}_COS'] = np.cos(radians)

# Drop identifier / bookkeeping columns that are not used as features.
df = df.drop(
    columns=['EPISODE_ID', 'EVENT_ID', 'DATA_SOURCE', 'BEGIN_RANGE', 'END_RANGE',
             'BEGIN_AZIMUTH', 'END_AZIMUTH'],
    errors='ignore',
)

print('Cleaned shape:', df.shape)
df.head(2)


In [None]:

# === 2) Feature Schema & Split ===
# Candidate feature columns, restricted to those actually present in the frame.
text_cols = [c for c in ['EVENT_NARRATIVE','EPISODE_NARRATIVE'] if c in df.columns]
num_cols = [c for c in ['INJURIES_DIRECT','INJURIES_INDIRECT','DEATHS_DIRECT','DEATHS_INDIRECT',
                        'DURATION_HOURS','LAT_MEAN','LON_MEAN','LAT_SIN','LAT_COS','LON_SIN','LON_COS']
            if c in df.columns]
cat_cols = [c for c in ['STATE','CZ_NAME','EVENT_TYPE'] if c in df.columns]

# Two-column target matrix [property, crop]; NaN and infinite entries become 0.
y = df[['Y_PROP','Y_CROP']].to_numpy(dtype='float32')
y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)

X_tab = df[num_cols + cat_cols].copy()

# 70% train, then the remaining 30% is halved into validation and test.
X_train_tab, X_temp_tab, y_train, y_temp = train_test_split(X_tab, y, test_size=0.30, random_state=SEED)
X_valid_tab, X_test_tab, y_valid, y_test = train_test_split(X_temp_tab, y_temp, test_size=0.50, random_state=SEED)

# Fill missing narratives BEFORE casting to str: astype(str) first would turn
# NaN into the literal text 'nan', which would then be embedded as real content.
texts = {c: df[c].fillna('').astype(str) for c in text_cols}

# Align the text columns with the tabular splits through the shared row index.
X_train_txt = {c: texts[c].loc[X_train_tab.index] for c in text_cols}
X_valid_txt = {c: texts[c].loc[X_valid_tab.index] for c in text_cols}
X_test_txt  = {c: texts[c].loc[X_test_tab.index]  for c in text_cols}

print('Splits ->', 'train:', X_train_tab.shape, 'valid:', X_valid_tab.shape, 'test:',  X_test_tab.shape)


In [None]:

# === 3) Cached MiniLM Embeddings ===
from sentence_transformers import SentenceTransformer
import torch, hashlib, numpy as np

# Sentence-embedding model name; override with the EMBED_MODEL environment variable.
EMBED_MODEL_NAME = os.getenv('EMBED_MODEL', 'all-MiniLM-L6-v2')

# Run the encoder on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
embed_model = SentenceTransformer(EMBED_MODEL_NAME, device=DEVICE)

# Best effort: cap the model's sequence length at 128; a failure to set it is ignored.
try:
    embed_model.max_seq_length = 128
except Exception:
    pass

# On-disk cache directory for computed embeddings.
CACHE = Path('./results/embed_cache')
CACHE.mkdir(parents=True, exist_ok=True)

def md5_of_series(ser):
    """Return the hex MD5 digest of a Series' values.

    Each value is rendered with ``str``, encoded as UTF-8 (characters that
    cannot be encoded are skipped) and terminated with a newline before being
    fed to the hash, so both content and order affect the digest.
    """
    digest = hashlib.md5()
    for text in ser.astype(str).values:
        encoded = text.encode('utf-8', errors='ignore')
        digest.update(encoded)
        digest.update(b'\n')
    return digest.hexdigest()

def cache_embed(series):
    """Embed every text in *series* with ``embed_model``, reusing an on-disk cache.

    The cache file name combines the model name, the row count, an MD5 of the
    texts, the embedding dimension, the model's max sequence length and the
    device. On a cache hit the stored array is loaded and returned without
    running the model.

    Args:
        series: pandas Series of texts; missing values are embedded as ''.

    Returns:
        A float32 array with one row per text when freshly encoded, or the
        array previously saved under the same cache key.

    Raises:
        RuntimeError: Propagated from encoding, except for CUDA out-of-memory
            errors, which are retried with a halved batch size while the
            batch size is still above 16.
    """
    # Fill before casting: astype(str) first would turn NaN into the text 'nan'.
    series = series.fillna('').astype(str)
    dim = embed_model.get_sentence_embedding_dimension()

    # Model ids may contain path separators (e.g. 'org/model'); flatten them so
    # the key stays a single file name directly inside CACHE.
    safe_name = EMBED_MODEL_NAME.replace('/', '_').replace('\\', '_')
    key = f"{safe_name}_{len(series)}_{md5_of_series(series)}_{dim}_{embed_model.max_seq_length}_{DEVICE}.npy"
    path = CACHE / key
    if path.exists():
        return np.load(path)

    data = series.tolist()
    out = []
    bs = 768 if DEVICE == 'cuda' else 256
    i = 0
    while i < len(data):
        chunk = data[i:i + bs]
        try:
            with torch.inference_mode():
                em = embed_model.encode(chunk, batch_size=bs, convert_to_numpy=True,
                                        show_progress_bar=False, normalize_embeddings=False)
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e) and bs > 16 and DEVICE == 'cuda':
                # Free cached blocks and retry the same offset with a smaller batch.
                torch.cuda.empty_cache()
                bs //= 2
                print('[embed] OOM → batch_size =', bs)
                continue
            raise
        out.append(em.astype('float32', copy=False))
        i += len(chunk)

    arr = np.vstack(out) if out else np.empty((0, dim), dtype='float32')

    # Write to a temporary file and rename it into place, so an interrupted run
    # cannot leave a truncated cache file that a later run would load.
    tmp_path = path.with_name(path.name + '.tmp')
    with open(tmp_path, 'wb') as fh:
        np.save(fh, arr)
    os.replace(tmp_path, path)
    return arr

# One embedding matrix per text column, collected separately for each split.
emb_train, emb_valid, emb_test = [], [], []
for c in text_cols:
    with timer(f'Embed {c}'):
        # Same order as the splits: train, valid, test.
        for sink, split_texts in ((emb_train, X_train_txt),
                                  (emb_valid, X_valid_txt),
                                  (emb_test, X_test_txt)):
            sink.append(cache_embed(split_texts[c]))

# Embedding width; 0 when there are no text columns.
if text_cols:
    EMB_DIM = embed_model.get_sentence_embedding_dimension()
else:
    EMB_DIM = 0
print('Embedding dim:', EMB_DIM, '| #text cols:', len(text_cols))


In [None]:

# === 4) Tabular Preprocessing + Concatenate Embeddings ===
from scipy import sparse

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then sparse one-hot encoding
# capped at 100 categories per column; unseen categories are ignored.
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True, max_categories=100)
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', cat_encoder),
])

# Route each training column to its branch; anything else is dropped.
tab_num_cols = [c for c in X_train_tab.columns if c in num_cols]
tab_cat_cols = [c for c in X_train_tab.columns if c in cat_cols]
pre_tab = ColumnTransformer(
    [
        ('num', num_pipe, tab_num_cols),
        ('cat', cat_pipe, tab_cat_cols),
    ],
    remainder='drop',
    sparse_threshold=0.3,
)

# Fit on the training split only, then apply the fitted transform to the others.
with timer('Fit tabular preproc'):
    Xt_train_tab = pre_tab.fit_transform(X_train_tab)
    Xt_valid_tab = pre_tab.transform(X_valid_tab)
    Xt_test_tab = pre_tab.transform(X_test_tab)

def hstack_features(tab, embs_list):
    """Join the tabular matrix with the embedding blocks column-wise.

    *tab* is kept as-is when already sparse, otherwise converted to CSR.
    With an empty *embs_list* that sparse matrix is returned directly;
    otherwise the embedding arrays are concatenated along axis 1, converted
    to CSR and stacked to the right of *tab*, yielding a CSR matrix.
    """
    tab_sparse = tab if sparse.issparse(tab) else sparse.csr_matrix(tab)
    if not embs_list:
        return tab_sparse
    dense_embs = np.concatenate(embs_list, axis=1)
    blocks = [tab_sparse, sparse.csr_matrix(dense_embs)]
    return sparse.hstack(blocks, format='csr')

# Final design matrices: preprocessed tabular features followed by the embeddings.
Xt_train, Xt_valid, Xt_test = [
    hstack_features(tab_part, emb_part)
    for tab_part, emb_part in ((Xt_train_tab, emb_train),
                               (Xt_valid_tab, emb_valid),
                               (Xt_test_tab, emb_test))
]
print('Xt shapes:', Xt_train.shape, Xt_valid.shape, Xt_test.shape)


In [None]:

# === 5) XGB Tweedie (GPU‑aware) + Slim Optuna ===
import xgboost as xgb, optuna

def sanitize_targets_nat(y_arr):
    """Return a float32 copy of *y_arr* with NaN, infinite and negative entries set to 0."""
    cleaned = np.nan_to_num(y_arr, nan=0.0, posinf=0.0, neginf=0.0)
    cleaned = cleaned.astype('float32')
    negative = cleaned < 0
    cleaned[negative] = 0.0
    return cleaned

# Column 0 holds property damage, column 1 crop damage; one cleaned vector per split.
y_prop_tr, y_prop_va, y_prop_te = [sanitize_targets_nat(part[:, 0]) for part in (y_train, y_valid, y_test)]
y_crop_tr, y_crop_va, y_crop_te = [sanitize_targets_nat(part[:, 1]) for part in (y_train, y_valid, y_test)]

def make_xgb_tweedie(base=None):
    """Build an unfitted ``XGBRegressor`` with the Tweedie objective.

    Defaults depend on ``MODE`` ('fast' uses fewer trees and a higher learning
    rate) and on ``HAS_CUDA`` (selects the compute device).

    Args:
        base: Optional dict of constructor parameters that override the
            defaults, e.g. the best parameters found by tuning.

    Returns:
        An ``xgb.XGBRegressor`` configured with the merged parameters.
    """
    fast = MODE == 'fast'
    params = dict(
        objective='reg:tweedie',
        tweedie_variance_power=1.4,
        tree_method='hist',
        # `device` selects the hardware for both training and prediction; the
        # legacy `predictor` option it superseded is no longer passed.
        device='cuda' if HAS_CUDA else 'cpu',
        random_state=SEED,
        n_jobs=-1,
        max_bin=256,
        n_estimators=420 if fast else 700,
        max_depth=6,
        learning_rate=0.07 if fast else 0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_lambda=1.0,
    )
    if base:
        params.update(base)
    return xgb.XGBRegressor(**params)

def objective(trial, Xtr, ytr, Xva, yva):
    """Optuna objective: fit one Tweedie XGBoost candidate and score it.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        Xtr, ytr: Training features and target.
        Xva, yva: Validation features and target; used both as the
            early-stopping evaluation set and for the returned score.

    Returns:
        Validation mean absolute error as a float (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 240, 600),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.12, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 3.0, log=True),
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'max_bin': trial.suggest_int('max_bin', 128, 512),
        'tree_method': 'hist',
        # `device` covers training and prediction; the legacy `predictor`
        # option it superseded is no longer passed.
        'device': ('cuda' if HAS_CUDA else 'cpu'),
        'objective': 'reg:tweedie',
        'random_state': SEED,
        'n_jobs': -1,
    }
    # Early stopping is configured on the estimator: passing
    # `early_stopping_rounds` to fit() is deprecated and rejected by newer
    # XGBoost releases.
    model = xgb.XGBRegressor(early_stopping_rounds=40, **params)
    model.fit(Xtr, ytr, eval_set=[(Xva, yva)], verbose=False)
    pred = model.predict(Xva)
    return float(mean_absolute_error(yva, pred))

def run_optuna(name, Xtr, ytr, Xva, yva, n_trials):
    """Tune one target with a seeded TPE study and return the best parameters.

    Minimises the value returned by ``objective`` over *n_trials* trials,
    prints the best score, and writes the best parameters to
    ``RESULTS/optuna_tweedie_<name lower-cased>.json``.
    """
    def _score(trial):
        return objective(trial, Xtr, ytr, Xva, yva)

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(_score, n_trials=n_trials, show_progress_bar=False)

    best_params = study.best_params
    print(f'✅ {name} best MAE: {study.best_value:.3f}')
    save_json(best_params, RESULTS/f'optuna_tweedie_{name.lower()}.json')
    return best_params

# Search budget per target: 6 trials in 'fast' mode, 20 otherwise.
TRIALS = 6 if MODE=='fast' else 20
with timer('Optuna: Property'):
    best_prop = run_optuna('Property', Xt_train, y_prop_tr, Xt_valid, y_prop_va, n_trials=TRIALS)
with timer('Optuna: Crop'):
    best_crop = run_optuna('Crop', Xt_train, y_crop_tr, Xt_valid, y_crop_va, n_trials=TRIALS)

# Early stopping is configured on the estimator: passing
# `early_stopping_rounds` to fit() is deprecated and rejected by newer
# XGBoost releases. The tuned parameters override the factory defaults.
FINAL_EARLY_STOPPING_ROUNDS = 50
xgb_prop = make_xgb_tweedie({**best_prop, 'early_stopping_rounds': FINAL_EARLY_STOPPING_ROUNDS})
xgb_crop = make_xgb_tweedie({**best_crop, 'early_stopping_rounds': FINAL_EARLY_STOPPING_ROUNDS})
with timer('Fit final Property'):
    xgb_prop.fit(Xt_train, y_prop_tr, eval_set=[(Xt_valid, y_prop_va)], verbose=False)
with timer('Fit final Crop'):
    xgb_crop.fit(Xt_train, y_crop_tr, eval_set=[(Xt_valid, y_crop_va)], verbose=False)

def eval_metrics(y_true, y_pred):
    """Return MAE, RMSE and R2 of *y_pred* against *y_true* as plain Python floats."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {'MAE': float(mae), 'RMSE': float(rmse), 'R2': float(r2)}

# Score both final models on the validation and test splits.
pred_prop_va = xgb_prop.predict(Xt_valid)
pred_crop_va = xgb_crop.predict(Xt_valid)
pred_prop_te = xgb_prop.predict(Xt_test)
pred_crop_te = xgb_crop.predict(Xt_test)

metrics = {
    'valid_property': eval_metrics(y_prop_va, pred_prop_va),
    'valid_crop':     eval_metrics(y_crop_va, pred_crop_va),
    'test_property':  eval_metrics(y_prop_te, pred_prop_te),
    'test_crop':      eval_metrics(y_crop_te, pred_crop_te),
}

# Persist the metrics and both fitted models under RESULTS.
save_json(metrics, RESULTS/'metrics_fast_tweedie.json')
for fitted_model, filename in ((xgb_prop, 'xgb_tweedie_property_fast.joblib'),
                               (xgb_crop, 'xgb_tweedie_crop_fast.joblib')):
    save_joblib(fitted_model, RESULTS/filename)
print('✅ Saved tuned XGB Tweedie models & metrics →', RESULTS)


In [None]:

# === 6) Optional SHAP (small sample) ===
# SHAP is opt-in: it only runs when the DO_SHAP environment variable is exactly '1'.
DO_SHAP = os.getenv('DO_SHAP','0') == '1'
if not DO_SHAP:
    print('ℹ️ SHAP skipped. Set DO_SHAP=1 to enable.')
else:
    import shap
    import scipy.sparse as sp

    # Explain at most the first 1500 validation rows, densified when the matrix supports it.
    nsamp = min(1500, Xt_valid.shape[0])
    Xv = Xt_valid[:nsamp]
    if hasattr(Xt_valid, 'toarray'):
        Xv = Xv.toarray()

    # Only the property-damage model is explained.
    explainer = shap.TreeExplainer(xgb_prop)
    sv = explainer.shap_values(Xv)

    shap_path = RESULTS/'shap_values_property_fast.npy'
    np.save(shap_path, sv)
    print('✅ Saved SHAP →', shap_path)
