# BoXHED vs LSTM (누수 완화 버전: 미래 예측 + 이벤트 이후 제거)

## 예측 정의(두 모델 동일 적용)
- **대상 단위**: `stay_id`
- **시간 축**: `t` (시간 인덱스)
- **원천 이벤트 컬럼**: `event`(우선) 또는 `delta`
- **미래 예측 라벨**: 각 시점 `t`에서 **(t, t+H] 구간(H=`HORIZON_HOURS`) 안에 이벤트가 1회라도 발생하면 1**, 아니면 0
  - 동시 시점(event at t)은 ‘예측’이 아니므로 라벨 계산에서 제외
- **이벤트 시점 및 이후 구간 제거**: 각 stay에서 **첫 이벤트 시점 `first_event_t`와 그 이후(t >= first_event_t)의 row를 모두 제거**하여(이벤트가 기록된 row 자체도 제거됨), 이벤트 발생 시점·이후의 정보로 학습/평가하는 문제를 줄임

## 평가(두 모델 동일 기준)
- row-level score(`row_score`)를 `t <= CUTOFF_HOURS` 구간에서 `AGG_MODE`로 stay-level로 집계
- stay-level 정답은 **`t <= (CUTOFF_HOURS + HORIZON_HOURS)` 내 실제 이벤트 발생 여부**
- threshold는 validation에서 **목표 recall(TARGET_RECALL) 이상 중 precision 최대**로 선택 후 test에 동일 적용


In [1]:
# 환경 준비
!pip -q install -U "numpy<2"
!pip -q install -U boxhed
!pip -q install -U tensorflow

import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Compatibility shim: when this NumPy build exposes no `np.cfloat`
# attribute, define it as an alias of `np.complex128`.
# NOTE(review): presumably required because a dependency imported below
# still references `np.cfloat` — confirm which one.
if not hasattr(np, "cfloat"):
    np.cfloat = np.complex128

from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, confusion_matrix,
    precision_score, recall_score, f1_score, accuracy_score
)
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models

from boxhed.boxhed import boxhed


In [2]:
# Load the train / valid / test splits (CSV files in a Google Drive folder).
IN_COLAB = "google.colab" in sys.modules
DRIVE_SOURCE_FOLDER = "/content/drive/MyDrive/mini/박소현/boxhed_io"  # edit if needed

# Drive is only mounted when the Colab runtime is detected.
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")

base = Path(DRIVE_SOURCE_FOLDER)
train_path = base.joinpath("train_df.csv")
valid_path = base.joinpath("valid_df.csv")
test_path = base.joinpath("test_df.csv")

# Fail fast, naming the first missing file, before any CSV is read.
for p in (train_path, valid_path, test_path):
    if p.exists():
        continue
    raise FileNotFoundError(f"파일이 없음: {p}")

train_df, valid_df, test_df = (
    pd.read_csv(csv_file) for csv_file in (train_path, valid_path, test_path)
)

print("Loaded:", train_df.shape, valid_df.shape, test_df.shape)
print("Columns:", len(train_df.columns))


Mounted at /content/drive
Loaded: (3987360, 15) (976560, 15) (1243080, 15)
Columns: 15


In [3]:
# Shared schema / settings used by both models.
ID_COL, TIME_COL = "stay_id", "t"

# Prefer the `event` column; fall back to `delta` when it is absent.
if "event" in train_df.columns:
    EVENT_COL = "event"
else:
    EVENT_COL = "delta"

# Optional per-row weight column (None when the data has none).
if "sample_weight" in train_df.columns:
    WEIGHT_COL = "sample_weight"
else:
    WEIGHT_COL = None

# ===== Target-alignment settings =====
HORIZON_HOURS = 6          # at time t, look for an event within (t, t+H]
DROP_AFTER_EVENT = True    # drop observations with t >= first_event_t
CUTOFF_HOURS = 24          # rows with t <= CUTOFF_HOURS feed the stay-level score
AGG_MODE = "max"           # one of {"max","mean","last"}
TARGET_RECALL = 0.80       # recall floor used when picking the threshold

import numpy as np
import pandas as pd

def add_future_label(df: pd.DataFrame, id_col=ID_COL, time_col=TIME_COL, event_col=EVENT_COL, horizon_hours=None):
    """Attach a binary ``_future_label`` column: is there an event in (t, t + H]?

    For each row at time ``t`` the label is 1 when the same stay contains at
    least one row with ``event_col == 1`` whose time lies in the half-open
    window ``(t, t + horizon_hours]``, and 0 otherwise.  An event recorded at
    ``t`` itself is not counted, because that would not be a prediction.

    Labels are written back by row *position*, so the result does not depend
    on ``df`` having unique index labels.

    Args:
        df: Long-format frame with one row per (stay, time). Not modified.
        id_col: Column identifying the stay.
        time_col: Column holding the row time.
        event_col: Column holding the event indicator (cast to int; 1 = event).
        horizon_hours: Window length H in the unit of ``time_col``. ``None``
            falls back to the module-level ``HORIZON_HOURS``.

    Returns:
        A copy of ``df`` (same rows, order and index) with an added integer
        ``_future_label`` column.
    """
    if horizon_hours is None:
        horizon_hours = HORIZON_HOURS

    out = df.copy()
    labels = np.zeros(len(out), dtype=np.int64)

    # Positional working frame: its RangeIndex is the row position in `out`.
    keys = pd.DataFrame(
        {
            "_id": out[id_col].to_numpy(),
            "_t": out[time_col].to_numpy(),
            "_e": out[event_col].to_numpy().astype(int),
        }
    )

    # Stays without any event keep the default label 0, so skip them entirely.
    event_ids = keys.loc[keys["_e"] == 1, "_id"].unique()
    keys = keys.loc[keys["_id"].isin(event_ids)].sort_values(["_id", "_t"])

    for _, g in keys.groupby("_id", sort=False):
        t = g["_t"].to_numpy()
        ev_times = t[g["_e"].to_numpy() == 1]  # sorted, because g is time-sorted
        if ev_times.size == 0:
            continue

        # Number of events with time <= t and <= t + H respectively; a
        # positive difference means at least one event falls in (t, t + H].
        n_upto_t = np.searchsorted(ev_times, t, side="right")
        n_upto_horizon = np.searchsorted(ev_times, t + horizon_hours, side="right")
        labels[g.index.to_numpy()] = (n_upto_horizon > n_upto_t).astype(np.int64)

    out["_future_label"] = labels
    return out

def add_label_observable_mask(df: pd.DataFrame, id_col=ID_COL, time_col=TIME_COL, label_col="_future_label",
                              horizon_hours=None):
    """Add a boolean ``_label_observable`` column (right-censoring handling).

    A row counts as observable when either
      * its label is 1, or
      * its label is 0 and the stay has a row at or beyond ``t + H``
        (``t + horizon_hours <= last time of the stay``).

    ``horizon_hours=None`` falls back to the module-level ``HORIZON_HOURS``.
    The result comes from a left merge on ``id_col``, so it carries a fresh
    index rather than the index of ``df``.
    """
    horizon = HORIZON_HOURS if horizon_hours is None else horizon_hours

    # Last observed time per stay, broadcast back onto every row of the stay.
    last_seen = df.groupby(id_col)[time_col].max().rename("_t_last")
    merged = df.merge(last_seen, on=id_col, how="left")

    is_positive = merged[label_col] == 1
    window_covered = merged[time_col] + horizon <= merged["_t_last"]
    merged["_label_observable"] = is_positive | window_covered

    return merged.drop(columns=["_t_last"])

def drop_rows_after_first_event(df: pd.DataFrame, id_col=ID_COL, time_col=TIME_COL, event_col=EVENT_COL):
    """Drop every row at or after the stay's first event (t >= first_event_t).

    Stays without any ``event_col == 1`` row are returned untouched.  The
    result comes from a left merge on ``id_col`` followed by a filter, so its
    index is not the index of ``df``.
    """
    is_event = df[event_col].astype(int) == 1
    first_event = (
        df.loc[is_event]
        .groupby(id_col)[time_col]
        .min()
        .rename("_first_event_t")
    )
    merged = df.merge(first_event, on=id_col, how="left")

    never_had_event = merged["_first_event_t"].isna()
    strictly_before = merged[time_col] < merged["_first_event_t"]
    return merged.loc[never_had_event | strictly_before].drop(columns=["_first_event_t"])

def _prepare_split(frame):
    """Apply labelling, censoring filter and post-event removal to one split."""
    # 1) future-event label
    frame = add_future_label(frame, horizon_hours=HORIZON_HOURS)
    # 2) right-censoring mask, then drop rows whose label is not observable
    #    (setting weight=0 instead would be an alternative)
    frame = add_label_observable_mask(frame, horizon_hours=HORIZON_HOURS)
    frame = frame.loc[frame["_label_observable"]].copy()
    # 3) remove information from the event time onwards (leakage prevention)
    if DROP_AFTER_EVENT:
        frame = drop_rows_after_first_event(frame)
    return frame


train_df = _prepare_split(train_df)
valid_df = _prepare_split(valid_df)
test_df = _prepare_split(test_df)

LABEL_COL = "_future_label"  # common target for every model (future H-hour label)

# (optional) operational assumption: restrict training rows to the cutoff window
APPLY_CUTOFF_TO_TRAIN = False
if APPLY_CUTOFF_TO_TRAIN:
    within_cutoff = train_df[TIME_COL] <= CUTOFF_HOURS
    train_df = train_df.loc[within_cutoff].copy()


In [4]:

# 공통 평가 유틸
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

def safe_auc_ap(y, s):
    """Return ``(ROC AUC, average precision)`` for labels ``y`` and scores ``s``.

    When ``y`` holds fewer than two distinct classes neither metric is
    defined and ``(nan, nan)`` is returned instead.
    """
    labels = np.asarray(y).astype(int)
    scores = np.asarray(s).astype(float)
    if np.unique(labels).size >= 2:
        return roc_auc_score(labels, scores), average_precision_score(labels, scores)
    return np.nan, np.nan

def stay_level_from_row(df_long: pd.DataFrame, row_score: np.ndarray,
                        id_col=ID_COL, time_col=TIME_COL,
                        cutoff_hours=CUTOFF_HOURS,
                        agg=AGG_MODE,
                        label_col=LABEL_COL,
                        horizon_hours=None,
                        **kwargs):
    """Convert row-level scores to stay-level labels/scores under one evaluation frame.

    IMPORTANT:
      - stay-level ground truth is derived from `label_col` (e.g., _future_label),
        not from raw `event`, so that DROP_AFTER_EVENT does not zero-out positives.
      - aggregation is computed within `t <= cutoff_hours`.

    Args:
        df_long: Long-format frame, one row per (stay, time).
        row_score: One score per row of ``df_long``, in the SAME row order as
            ``df_long`` (positional alignment).
        id_col: Stay identifier column.
        time_col: Time column.
        cutoff_hours: Only rows with ``time_col <= cutoff_hours`` are aggregated.
        agg: ``"max"``, ``"mean"`` or ``"last"`` (``"last"`` = latest row in
            time order within the cutoff window).
        label_col: Row-level label column; the stay label is its maximum
            within the cutoff window.
        horizon_hours: Accepted for call compatibility; not used.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        ``(y_stay, s_stay)`` numpy arrays with one entry per stay that has at
        least one row inside the cutoff window, in matching order.

    Raises:
        ValueError: If ``row_score`` and ``df_long`` differ in length, or
            ``agg`` is not one of the supported modes.
    """
    if len(row_score) != len(df_long):
        raise ValueError(f"Length mismatch: row_score={len(row_score)} df={len(df_long)}")
    if agg not in ("max", "mean", "last"):
        raise ValueError(f"Unknown agg: {agg}")

    # Attach scores BEFORE sorting/filtering so every score stays on the row
    # it was computed for; attaching afterwards would misalign them.
    d = df_long.copy()
    d["_row_score"] = np.asarray(row_score)
    d = d.sort_values([id_col, time_col])
    d_cut = d.loc[d[time_col] <= cutoff_hours]

    by_stay = d_cut.groupby(id_col)
    if agg == "max":
        s_stay = by_stay["_row_score"].max()
    elif agg == "mean":
        s_stay = by_stay["_row_score"].mean()
    else:  # "last"
        s_stay = by_stay["_row_score"].last()

    # Same groupby => same stay index as s_stay, so the two arrays line up.
    y_stay = by_stay[label_col].max().astype(int)

    return y_stay.to_numpy(), s_stay.to_numpy()

def pick_threshold_by_recall_then_precision(y, s, target_recall=TARGET_RECALL):
    """Pick the threshold with the highest precision among those reaching ``target_recall``.

    Intended to be run on validation data.  Ties in precision are resolved
    towards the larger (more conservative) threshold.  When no threshold
    reaches ``target_recall`` the one with the highest recall is used.

    Note: a later cell in this file defines another function with the same
    name; once that cell has run, it replaces this definition.

    Returns:
        ``(threshold, precision, recall)`` as Python floats.
    """
    prec_all, rec_all, thr = precision_recall_curve(y, s)
    # The curve has one more precision/recall point than thresholds; drop the
    # final point so index i of each array refers to thr[i].
    prec, rec = prec_all[:-1], rec_all[:-1]

    eligible = np.flatnonzero(rec >= target_recall)
    if eligible.size == 0:
        # Target recall is unreachable: settle for the highest recall.
        pick = int(np.argmax(rec))
    else:
        # lexsort uses the LAST key as primary: precision descending first,
        # then threshold descending.
        ranking = np.lexsort((-thr[eligible], -prec[eligible]))
        pick = eligible[ranking[0]]

    return float(thr[pick]), float(prec[pick]), float(rec[pick])

def evaluate_split(name, df, row_score):
    """Summarise one split: stay-level AUC / AP and the number of stays scored."""
    y_stay, s_stay = stay_level_from_row(df, row_score)
    auc, ap = safe_auc_ap(y_stay, s_stay)
    return dict(split=name, auc=auc, ap=ap, n_stay=int(len(y_stay)))


In [5]:
# =============================
# Helper functions (self-contained)
# =============================

import numpy as np

def stay_score_with_cutoff(df_long, row_score, cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE):
    """Aggregate row-level scores into one label and one score per stay.

    ``row_score`` must have one entry per row of ``df_long`` in the same row
    order.  Only rows with ``TIME_COL <= cutoff_hours`` are used.  The stay
    score is the ``agg`` ("max" / "mean" / "last" in time order) of the row
    scores; the stay label is the maximum of ``LABEL_COL``.

    Returns:
        ``(y_stay, s_stay)`` numpy arrays in matching stay order.

    Raises:
        ValueError: On a length mismatch or an unknown ``agg``.
    """
    if len(row_score) != len(df_long):
        raise ValueError(f"Length mismatch: row_score={len(row_score)} df={len(df_long)}")

    # Scores are attached before sorting so they travel with their rows.
    scored = df_long.copy()
    scored["_row_score"] = np.asarray(row_score, dtype=np.float32)
    scored = scored.sort_values([ID_COL, TIME_COL])
    window = scored.loc[scored[TIME_COL] <= cutoff_hours].copy()

    reducers = {
        "max": lambda grouped: grouped.max(),
        "mean": lambda grouped: grouped.mean(),
        "last": lambda grouped: grouped.last(),
    }
    if agg not in reducers:
        raise ValueError(f"Unknown agg: {agg}")
    s_stay = reducers[agg](window.groupby(ID_COL)["_row_score"])

    y_stay = window.groupby(ID_COL)[LABEL_COL].max().astype(int)
    shared = s_stay.index.intersection(y_stay.index)
    return y_stay.loc[shared].to_numpy(), s_stay.loc[shared].to_numpy()

def apply_class_weight(w, y):
    """Return float32 row weights with positives up-weighted by ``n_neg / n_pos``.

    Args:
        w: Base weights (array-like) or ``None`` for all-ones. Not modified.
        y: Labels (cast to int); 1 marks a positive, 0 a negative.

    Raises:
        ValueError: If ``y`` contains no positive label.
    """
    labels = np.asarray(y).astype(int)
    is_pos = labels == 1
    n_pos = is_pos.sum()
    n_neg = (labels == 0).sum()
    if n_pos == 0:
        raise ValueError("No positive labels in training.")

    if w is None:
        weights = np.ones_like(labels, dtype=np.float32)
    else:
        # Copy so the caller's array is never mutated.
        weights = np.asarray(w, dtype=np.float32).copy()

    weights[is_pos] *= float(n_neg / n_pos)
    return weights

def hazard_to_1d(hz):
    """Reduce a model output to one float32 value per row.

    A 1-D input is returned as float32 unchanged in value.  Any higher-rank
    input is flattened per row (first axis kept) and reduced to the row-wise
    maximum.
    """
    arr = np.asarray(hz)
    if arr.ndim != 1:
        arr = arr.reshape(arr.shape[0], -1).max(axis=1)
    return arr.astype(np.float32)

def sigmoid(x):
    """Logistic function computed in float32.

    Inputs are clipped to [-50, 50] first so ``exp`` cannot overflow.
    """
    z = np.clip(np.asarray(x, dtype=np.float32), -50.0, 50.0)
    return 1.0 / (np.exp(-z) + 1.0)

def predict_row_score_boxhed_df(model, df_eval, feature_cols):
    """Score every row of ``df_eval`` with a fitted BoXHED model.

    BoXHED is used here as a classifier for the "event within the next H
    hours" label: ``model.hazard`` is evaluated on the float32 feature
    matrix, reduced to one value per row, and squashed with a sigmoid.  The
    sigmoid is monotone, so ranking metrics (AUC / AP) are unaffected by it.
    NaN hazards are replaced by 0 and +/-inf by +/-50 before the sigmoid.

    Returns:
        float32 array with one score per row of ``df_eval``.

    Raises:
        RuntimeError: If the model returns a different number of rows.
    """
    features = df_eval[feature_cols].to_numpy(dtype=np.float32)
    raw = hazard_to_1d(model.hazard(features))
    raw = np.nan_to_num(raw, nan=0.0, posinf=50.0, neginf=-50.0)
    scores = sigmoid(raw).astype(np.float32)
    if len(scores) == len(df_eval):
        return scores
    raise RuntimeError("BoXHED row_score length mismatch")


In [7]:
# Feature definition (the LSTM uses exactly the same features).
# Following the earlier notebook's "ORIGINAL FEATURES ONLY" intent, the
# id / time / label / weight columns and some derived indices are excluded.
EXTRA_DROP = {"calc_DiasBP", "ShockIndex", "PulsePressure", "ModShockIndex", "ROX_Index","_label_observable"}
DROP_COLS = {ID_COL, TIME_COL, "event", "delta", LABEL_COL} | EXTRA_DROP
if WEIGHT_COL:
    DROP_COLS = DROP_COLS | {WEIGHT_COL}

# Keep the column order of train_df.
FEATURE_COLS = [col for col in train_df.columns if col not in DROP_COLS]
if not FEATURE_COLS:
    raise RuntimeError("피처 컬럼이 0개")

print(f"n_features={len(FEATURE_COLS)}")
print(FEATURE_COLS)
print(train_df.columns)

n_features=12
['HeartRate_std_6h', 'GCS_Verbal', 'SpO2_measured', 'RespRate_std_6h', 'SysBP', 'GCS_Motor', 'GCS_Total_mean_6h', 'Temp_std_6h', 'pH', 'DiasBP_mean_6h', 'MeanBP', 'FiO2']
Index(['stay_id', 't', 'event', 'HeartRate_std_6h', 'GCS_Verbal',
       'SpO2_measured', 'RespRate_std_6h', 'SysBP', 'GCS_Motor',
       'GCS_Total_mean_6h', 'Temp_std_6h', 'pH', 'DiasBP_mean_6h', 'MeanBP',
       'FiO2', '_future_label', '_label_observable'],
      dtype='object')


In [9]:
# =============================
# (추가) 학습 전: 학습 데이터/피처/전처리 산출물 추출
# - train/valid/test (전처리 완료 DF)
# - BoXHED 입력 행렬(X/y/w)
# - LSTM 입력(스케일러, 시퀀스/패딩)
# =============================

import os, json
from pathlib import Path

# Export location: ./artifacts/run_<timestamp>/
EXPORT_BASE = Path("./artifacts")
EXPORT_BASE.mkdir(parents=True, exist_ok=True)
RUN_TAG = f"{pd.Timestamp.now():%Y%m%d_%H%M%S}"
OUT_DIR = EXPORT_BASE.joinpath(f"run_{RUN_TAG}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("Export dir:", OUT_DIR.resolve())

# 1) 전처리 완료 데이터프레임 저장 (재현/검증 목적)
#   - 용량이 크면 parquet 권장. (로컬에서 parquet 엔진 미설치 시 csv로 대체)

def _safe_to_parquet(df, path_base: Path):
    """Write ``df`` next to ``path_base`` as Parquet, falling back to CSV.

    The suffix of ``path_base`` is replaced by ``.parquet``; if that write
    raises for any reason, the frame is written as ``.csv`` instead and a
    warning is printed.  The index is not written in either format.

    Returns:
        The path of the file that was written, as a string.
    """
    try:
        parquet_path = path_base.with_suffix('.parquet')
        df.to_parquet(parquet_path, index=False)
    except Exception as e:
        # Best-effort fallback (e.g. no Parquet engine available).
        csv_path = path_base.with_suffix('.csv')
        df.to_csv(csv_path, index=False)
        print(f"[WARN] parquet 저장 실패({e}). CSV로 저장: {csv_path}")
        return str(csv_path)
    return str(parquet_path)

processed_exports = (
    (train_df, "train_processed"),
    (valid_df, "valid_processed"),
    (test_df, "test_processed"),
)
saved_train, saved_valid, saved_test = [
    _safe_to_parquet(frame, OUT_DIR / stem) for frame, stem in processed_exports
]
print("Saved processed DFs:", saved_train, saved_valid, saved_test)

# 2) Save feature / configuration metadata
meta = dict(
    ID_COL=ID_COL,
    TIME_COL=TIME_COL,
    EVENT_COL=EVENT_COL,
    WEIGHT_COL=WEIGHT_COL,
    LABEL_COL=LABEL_COL,
    FEATURE_COLS=FEATURE_COLS,
    HORIZON_HOURS=HORIZON_HOURS,
    DROP_AFTER_EVENT=DROP_AFTER_EVENT,
    CUTOFF_HOURS=CUTOFF_HOURS,
    AGG_MODE=AGG_MODE,
    TARGET_RECALL=TARGET_RECALL,
)
meta_path = Path(OUT_DIR) / "meta.json"
meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
print("Saved meta.json")

# 3) Save the BoXHED training inputs
X_train_b = train_df[FEATURE_COLS].to_numpy(dtype=np.float32)
y_train_b = train_df[LABEL_COL].to_numpy(dtype=np.int64).reshape(-1)

has_weight_col = WEIGHT_COL is not None and WEIGHT_COL in train_df.columns
if has_weight_col:
    w_train_b = train_df[WEIGHT_COL].to_numpy(dtype=np.float32).reshape(-1)
else:
    w_train_b = None

# Store the weights both before and after class weighting (reproducibility);
# an empty array stands in for "no raw weights".
w_train2 = apply_class_weight(w_train_b, y_train_b)
w_raw_to_save = np.array([], dtype=np.float32) if w_train_b is None else w_train_b

np.savez_compressed(
    OUT_DIR / "boxhed_train_inputs.npz",
    X=X_train_b,
    y=y_train_b,
    w_raw=w_raw_to_save,
    w_applied=w_train2,
)
print("Saved boxhed_train_inputs.npz")

# # 4) LSTM 전처리 산출물 저장
# #   - scaler: train fit 후 valid/test transform에 사용
# #   - (옵션) 패딩된 X/y/sample_weight 저장 (용량이 커질 수 있음)

# import joblib
# joblib.dump(scaler, OUT_DIR / "lstm_scaler.joblib")
# print("Saved lstm_scaler.joblib")

# SAVE_LSTM_ARRAYS = False  # 필요 시 True로 변경(대용량)
# if SAVE_LSTM_ARRAYS:
#     np.savez_compressed(
#         OUT_DIR / "lstm_arrays.npz",
#         X_tr=X_tr,
#         y_tr=y_tr,
#         sw_tr=sw_tr,
#         X_va=X_va,
#         y_va=y_va,
#         sw_va=sw_va,
#         X_te=X_te,
#         y_te=y_te,
#     )
#     print("Saved lstm_arrays.npz")

# print("[DONE] Pre-training artifact export")


Export dir: /content/artifacts/run_20260117_075449
Saved processed DFs: artifacts/run_20260117_075449/train_processed.parquet artifacts/run_20260117_075449/valid_processed.parquet artifacts/run_20260117_075449/test_processed.parquet
Saved meta.json
Saved boxhed_train_inputs.npz


In [10]:
# Mount Google Drive only when running inside Colab.  Outside Colab the
# `google.colab` package is not importable, and an unguarded import would
# abort the notebook; the data-loading cell uses the same runtime check.
import sys

if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ===============================
# Evaluation helpers (SELF-CONTAINED)
# ===============================
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def safe_auc_ap(y_true, y_score):
    """Return ``(ROC AUC, average precision)`` as Python floats.

    ``(nan, nan)`` is returned when ``y_true`` has fewer than two distinct
    classes, since neither metric is defined in that case.
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_score).astype(float)
    if np.unique(labels).size < 2:
        return float('nan'), float('nan')
    auc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)
    return float(auc), float(ap)

def metrics_at_threshold(y_true, y_score, thr):
    """Confusion counts, precision and recall for the rule ``score >= thr``.

    Labels are cast to int (1 = positive, 0 = negative); a score counts as a
    positive prediction when it is ``>= thr``.  Precision and recall are
    reported as 0.0 when their denominator is zero.

    Returns:
        dict with keys ``tp``, ``fp``, ``tn``, ``fn``, ``precision``, ``recall``.
    """
    labels = np.asarray(y_true).astype(int)
    flagged = np.asarray(y_score).astype(float) >= float(thr)
    is_pos, is_neg = labels == 1, labels == 0

    tp = int(np.count_nonzero(flagged & is_pos))
    fp = int(np.count_nonzero(flagged & is_neg))
    tn = int(np.count_nonzero(~flagged & is_neg))
    fn = int(np.count_nonzero(~flagged & is_pos))

    n_flagged_known = tp + fp
    n_positive = tp + fn
    prec = tp / n_flagged_known if n_flagged_known > 0 else 0.0
    rec = tp / n_positive if n_positive > 0 else 0.0
    return {
        'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
        'precision': float(prec),
        'recall': float(rec),
    }

def pick_threshold_by_recall_then_precision(y_true, y_score, target_recall=0.8, n_grid=1001):
    """Grid-search a threshold: best precision among those reaching ``target_recall``.

    Candidate thresholds are ``n_grid`` evenly spaced values between the
    smallest and largest non-NaN score.  Among candidates whose recall
    reaches ``target_recall`` the one with the highest precision wins, ties
    going to the smaller threshold.  If no candidate reaches the target, the
    candidate with the highest recall (then precision, then smaller
    threshold) is used.

    When all scores are equal, or the score range is not finite, a single
    threshold is returned: the minimum score if it is finite, else 0.5.

    Returns:
        ``(threshold, precision, recall)`` as floats.
    """
    y_true = np.asarray(y_true).astype(int)
    y_score = np.asarray(y_score).astype(float)

    lo, hi = float(np.nanmin(y_score)), float(np.nanmax(y_score))
    range_is_usable = np.isfinite(lo) and np.isfinite(hi) and lo != hi
    if not range_is_usable:
        thr = lo if np.isfinite(lo) else 0.5
        m = metrics_at_threshold(y_true, y_score, thr)
        return float(thr), m['precision'], m['recall']

    # Evaluate every grid point once; both selection rules reuse the results.
    grid = [
        (thr, metrics_at_threshold(y_true, y_score, thr))
        for thr in np.linspace(lo, hi, n_grid)
    ]

    target = float(target_recall)
    reaching = [(thr, m) for thr, m in grid if m['recall'] + 1e-12 >= target]
    if reaching:
        thr, m = max(reaching, key=lambda tm: (tm[1]['precision'], -tm[0]))
    else:
        # Target recall unreachable: maximise recall, then precision.
        thr, m = max(
            grid,
            key=lambda tm: (tm[1]['recall'], tm[1]['precision'], -tm[0]),
            default=None,
        )
    return float(thr), float(m['precision']), float(m['recall'])



In [None]:
# =============================
# 1) BoXHED 학습/예측 (목표 정합 버전)
# - 학습 타깃: LABEL_COL (= (t, t+H] 미래 이벤트)
# - 예측: hazard -> sigmoid(hazard) 를 row_score로 사용
# =============================

# BoXHED training data: feature matrix and the future-event label.
# NOTE(review): only FEATURE_COLS are passed (no id / time columns, and the
# library's own preprocessing step is not called here) — confirm that
# `boxhed.fit` accepts a plain feature matrix in this form.
X_train_b = train_df[FEATURE_COLS].to_numpy(dtype=np.float32)
y_train_b = train_df[LABEL_COL].to_numpy(dtype=np.int64).reshape(-1)

# Optional per-row weights, only when the data provides a weight column.
w_train_b = None
if WEIGHT_COL is not None and WEIGHT_COL in train_df.columns:
    w_train_b = train_df[WEIGHT_COL].to_numpy(dtype=np.float32).reshape(-1)

# Up-weight positive rows by n_neg / n_pos.
w_train2 = apply_class_weight(w_train_b, y_train_b)

boxhed_params = dict(max_depth=3, n_estimators=200, eta=0.05, min_child_events=1)
print("BoXHED params:", boxhed_params)

boxhed_model = boxhed(gpu_id=-1, nthread=8, **boxhed_params)
# NOTE(review): the third argument is used here as a class-balancing sample
# weight — verify against the BoXHED documentation what `fit` expects in
# this position (it may have a different meaning than a sample weight).
boxhed_model.fit(X_train_b, y_train_b, w_train2)

# Row-level scores on the evaluation frames.
valid_row_boxhed = predict_row_score_boxhed_df(boxhed_model, valid_df, FEATURE_COLS)
test_row_boxhed  = predict_row_score_boxhed_df(boxhed_model, test_df,  FEATURE_COLS)

# Stay-level evaluation (aggregate rows with t <= CUTOFF_HOURS per stay).
yv_b, sv_b = stay_score_with_cutoff(valid_df, valid_row_boxhed, cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE)
yt_b, st_b = stay_score_with_cutoff(test_df,  test_row_boxhed,  cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE)

# The threshold is chosen on validation only and then reused on test.
thr_b, _, _ = pick_threshold_by_recall_then_precision(yv_b, sv_b, target_recall=TARGET_RECALL)
auc_v_b, ap_v_b = safe_auc_ap(yv_b, sv_b)
auc_t_b, ap_t_b = safe_auc_ap(yt_b, st_b)

m_v_b = metrics_at_threshold(yv_b, sv_b, thr_b)
m_t_b = metrics_at_threshold(yt_b, st_b, thr_b)

print("[BoXHED][VALID] AUC=%.4f AP=%.4f thr=%.4f P=%.4f R=%.4f" % (auc_v_b, ap_v_b, thr_b, m_v_b["precision"], m_v_b["recall"]))
print("[BoXHED][TEST ] AUC=%.4f AP=%.4f thr=%.4f P=%.4f R=%.4f" % (auc_t_b, ap_t_b, thr_b, m_t_b["precision"], m_t_b["recall"]))


In [None]:
# =============================
# 2) LSTM 학습/예측 (row_score) - GPU enabled, 조건 동일
# =============================

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler

# -------------------------------------------------
# GPU / cuDNN / XLA / mixed precision settings
# (intended as runtime settings only, not a change of the experiment setup)
# -------------------------------------------------
print("TF version:", tf.__version__)
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

# Enable XLA JIT compilation (graph optimisation).
tf.config.optimizer.set_jit(True)

# Mixed precision: sets the global Keras dtype policy to 'mixed_float16'.
# NOTE(review): float16 compute can change numerical results slightly —
# confirm the LSTM metrics are comparable with a float32 run.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
print("Compute policy:", mixed_precision.global_policy())

# -------------------------------------------------
# Existing code (logic / conditions unchanged)
# -------------------------------------------------
PAD_VALUE = -999.0  # filler for padded timesteps; also the Masking layer's mask value
MAX_LEN = 120  # maximum sequence length (timesteps kept per stay)

def make_sequences(df_long, feature_cols, id_col=ID_COL, time_col=TIME_COL):
    """Split a long frame into per-stay sequences, ordered by time.

    Returns:
        Three parallel lists with one entry per stay:
          * ``X_list``   - float32 feature array, shape (n_rows, n_features)
          * ``y_list``   - float32 ``LABEL_COL`` array, shape (n_rows, 1)
          * ``idx_list`` - the index labels of the stay's rows in ``df_long``
    """
    ordered = df_long.sort_values([id_col, time_col]).copy()

    X_list, y_list, idx_list = [], [], []
    for _, stay_rows in ordered.groupby(id_col, sort=False):
        X_list.append(stay_rows[feature_cols].to_numpy(dtype=np.float32))
        y_list.append(stay_rows[LABEL_COL].to_numpy(dtype=np.float32).reshape(-1, 1))
        idx_list.append(stay_rows.index.to_numpy())
    return X_list, y_list, idx_list

def pad_3d(X_list, max_len, pad_value=PAD_VALUE):
    """Right-pad (or truncate) variable-length sequences to ``max_len`` steps.

    Returns:
        ``(X_pad, mask)`` where ``X_pad`` is float32 with shape
        (n_sequences, max_len, n_features), filled with ``pad_value`` beyond
        each sequence's end, and ``mask`` is a bool array of shape
        (n_sequences, max_len) that is True on real timesteps.
    """
    n_seq, n_feat = len(X_list), X_list[0].shape[1]
    X_pad = np.full((n_seq, max_len, n_feat), pad_value, dtype=np.float32)
    mask = np.zeros((n_seq, max_len), dtype=bool)
    for row, seq in enumerate(X_list):
        keep = min(seq.shape[0], max_len)  # steps beyond max_len are dropped
        X_pad[row, :keep, :] = seq[:keep]
        mask[row, :keep] = True
    return X_pad, mask

def pad_y(y_list, max_len):
    """Right-pad (or truncate) per-stay label arrays to ``max_len`` steps.

    Returns a float32 array of shape (n_sequences, max_len, 1); positions
    beyond a sequence's end are 0.
    """
    padded = np.zeros((len(y_list), max_len, 1), dtype=np.float32)
    for row, seq in enumerate(y_list):
        keep = min(seq.shape[0], max_len)
        padded[row, :keep, :] = seq[:keep]
    return padded

def build_lstm_timewise(input_shape, pad_value=PAD_VALUE):
    """Build and compile a per-timestep binary classifier.

    Architecture: Masking(pad_value) -> LSTM(64) -> Dropout(0.2) -> LSTM(32)
    -> TimeDistributed(Dense(1, sigmoid)), with both LSTMs returning full
    sequences so there is one output per timestep.

    Args:
        input_shape: Shape of one sample, passed to ``layers.Input``.
        pad_value: Value that marks padded timesteps for the Masking layer.

    Returns:
        A compiled Keras model (Adam lr=1e-3, binary cross-entropy, AUC
        metric named 'auc').
    """
    inp = layers.Input(shape=input_shape)
    x = layers.Masking(mask_value=pad_value)(inp)

    # NOTE(review): presumably runs on the cuDNN LSTM kernel when a GPU is
    # available — confirm the masking pattern keeps it eligible.
    x = layers.LSTM(64, return_sequences=True)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.LSTM(32, return_sequences=True)(x)

    # Keep the output layer in float32 for numerical stability when a
    # mixed-precision policy is active.
    out = layers.TimeDistributed(
        layers.Dense(1, activation='sigmoid', dtype='float32')
    )(x)

    model = models.Model(inp, out)
    # NOTE(review): AUC is passed via `metrics`, not `weighted_metrics` —
    # confirm whether sample weights / the padding mask are applied to it in
    # the installed Keras version.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )
    return model

# -------------------------------------------------
# scaling (train fit, valid/test transform)
# -------------------------------------------------
# Fit the scaler on the training features only; valid/test are transformed
# with the training statistics.
scaler = StandardScaler().fit(train_df[FEATURE_COLS].to_numpy(dtype=np.float32))


def scaled_copy(df):
    """Return a copy of ``df`` whose FEATURE_COLS are standardised with ``scaler``."""
    scaled_values = scaler.transform(df[FEATURE_COLS].to_numpy(dtype=np.float32))
    out = df.copy()
    out[FEATURE_COLS] = scaled_values
    return out


tr = scaled_copy(train_df)
va = scaled_copy(valid_df)
te = scaled_copy(test_df)

# -------------------------------------------------
# Sequence build + padding
# -------------------------------------------------
# One (features, labels, row-index) triple per stay, for each split.
X_tr_list, y_tr_list, idx_tr_list = make_sequences(tr, FEATURE_COLS)
X_va_list, y_va_list, idx_va_list = make_sequences(va, FEATURE_COLS)
X_te_list, y_te_list, idx_te_list = make_sequences(te, FEATURE_COLS)

# Pad / truncate every sequence to MAX_LEN; m_* marks the real timesteps.
X_tr, m_tr = pad_3d(X_tr_list, MAX_LEN)
X_va, m_va = pad_3d(X_va_list, MAX_LEN)
X_te, m_te = pad_3d(X_te_list, MAX_LEN)

y_tr = pad_y(y_tr_list, MAX_LEN)
y_va = pad_y(y_va_list, MAX_LEN)
y_te = pad_y(y_te_list, MAX_LEN)

# Per-timestep sample weights: 1 on real steps, 0 on padding, so padded
# steps are excluded from the loss.
sw_tr = m_tr.astype(np.float32)
sw_va = m_va.astype(np.float32)

# -------------------------------------------------
# Training (epochs / batch_size / patience unchanged)
# -------------------------------------------------
lstm_model = build_lstm_timewise((MAX_LEN, len(FEATURE_COLS)))
# Stop when validation AUC has not improved for 8 epochs and restore the
# best weights.
# NOTE(review): confirm that `val_auc` ignores padded timesteps (see how the
# AUC metric is registered when the model is compiled).
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=8,
    restore_best_weights=True
)

lstm_model.fit(
    X_tr, y_tr,
    validation_data=(X_va, y_va, sw_va),
    sample_weight=sw_tr,
    epochs=50,
    batch_size=256,
    callbacks=[es],
    verbose=1
)

# -------------------------------------------------
# row_score 계산 (로직 동일)
# -------------------------------------------------
def predict_row_score_lstm(model, df_long_scaled, X_list, idx_list, max_len=MAX_LEN):
    """Predict per-timestep scores and scatter them back onto the long frame.

    Args:
        model: Fitted model whose ``predict`` returns one value per
            (sequence, timestep).
        df_long_scaled: Long frame whose index labels identify the rows.
        X_list: Per-stay feature arrays.
        idx_list: Per-stay arrays of index labels, parallel to ``X_list``.
        max_len: Padded sequence length; timesteps beyond it get no score.

    Returns:
        float32 array with one score per row of ``df_long_scaled`` (in that
        frame's row order); rows without a prediction stay NaN.
    """
    padded, _ = pad_3d(X_list, max_len)
    probs = model.predict(padded, verbose=0).reshape(len(X_list), max_len)

    row_score = np.full((len(df_long_scaled),), np.nan, dtype=np.float32)
    # index label -> row position in df_long_scaled
    label_to_pos = {
        label: position
        for position, label in enumerate(df_long_scaled.index.to_numpy())
    }

    for seq_no, labels in enumerate(idx_list):
        n_keep = min(len(labels), max_len)
        targets = np.fromiter(
            (label_to_pos[label] for label in labels[:n_keep]),
            dtype=np.intp,
            count=n_keep,
        )
        row_score[targets] = probs[seq_no, :n_keep]
    return row_score

# Sorting keeps the original index labels of the long frames.
sort_keys = [ID_COL, TIME_COL]
tr_sorted, va_sorted, te_sorted = (
    frame.sort_values(sort_keys) for frame in (tr, va, te)
)

valid_row_lstm = predict_row_score_lstm(lstm_model, va_sorted, X_va_list, idx_va_list)
test_row_lstm  = predict_row_score_lstm(lstm_model, te_sorted, X_te_list, idx_te_list)

# -------------------------------------------------
# Stay-level evaluation
# -------------------------------------------------
yv_l, sv_l = stay_score_with_cutoff(va_sorted, valid_row_lstm, cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE)
yt_l, st_l = stay_score_with_cutoff(te_sorted, test_row_lstm, cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE)

# Threshold is selected on validation and reused unchanged on test.
thr_l, p_l, r_l = pick_threshold_by_recall_then_precision(yv_l, sv_l, target_recall=TARGET_RECALL)

auc_v_l, ap_v_l = safe_auc_ap(yv_l, sv_l)
auc_t_l, ap_t_l = safe_auc_ap(yt_l, st_l)

m_v_l = metrics_at_threshold(yv_l, sv_l, thr_l)
m_t_l = metrics_at_threshold(yt_l, st_l, thr_l)

valid_report = (auc_v_l, ap_v_l, thr_l, m_v_l['precision'], m_v_l['recall'])
test_report = (auc_t_l, ap_t_l, thr_l, m_t_l['precision'], m_t_l['recall'])
print("[LSTM][VALID] AUC=%.4f AP=%.4f thr=%.4f P=%.4f R=%.4f"
      % valid_report)
print("[LSTM][TEST ] AUC=%.4f AP=%.4f thr=%.4f P=%.4f R=%.4f"
      % test_report)


In [None]:
# =============================
# (추가) 학습 후: 모델/추론 아티팩트 추출
# - BoXHED 모델 저장
# - LSTM 모델 저장
# - (옵션) 임계값(threshold) 및 평가 요약 저장
# =============================

import json
from pathlib import Path
import joblib

# Reuse the OUT_DIR created by the pre-training export cell; create a fresh
# run directory only when that cell has not been executed.
try:
    OUT_DIR
except NameError:
    EXPORT_BASE = Path("./artifacts")
    EXPORT_BASE.mkdir(parents=True, exist_ok=True)
    RUN_TAG = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    OUT_DIR = EXPORT_BASE / f"run_{RUN_TAG}"
    OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Export dir:", OUT_DIR.resolve())

# 1) Save the BoXHED model (try the available formats in order).
#    boxhed_saved stays None when every attempt fails.
boxhed_saved = None
try:
    if hasattr(boxhed_model, "save_model"):
        path = OUT_DIR / "boxhed_model.json"
        boxhed_model.save_model(str(path))
        boxhed_saved = str(path)
    else:
        # Raised on purpose to fall through to the joblib fallback below.
        raise AttributeError("save_model not found")
except Exception as e:
    # fallback: pickle/joblib
    try:
        path = OUT_DIR / "boxhed_model.pkl"
        joblib.dump(boxhed_model, path)
        boxhed_saved = str(path)
    except Exception as e2:
        # Best effort: report both failures and continue with the export.
        print("[ERROR] BoXHED 모델 저장 실패:", e, e2)

print("Saved BoXHED model:", boxhed_saved)

# 2) Save the LSTM model. lstm_saved stays None when every attempt fails.
lstm_saved = None
try:
    # Preferred format: a single .keras file
    path = OUT_DIR / "lstm_model.keras"
    lstm_model.save(path)
    lstm_saved = str(path)
except Exception as e:
    # fallback: save to an extension-less path
    # NOTE(review): intended as a SavedModel directory — confirm the
    # installed Keras version accepts a path without an extension.
    try:
        path = OUT_DIR / "lstm_savedmodel"
        lstm_model.save(path)
        lstm_saved = str(path)
    except Exception as e2:
        # Best effort: report both failures and continue with the export.
        print("[ERROR] LSTM 모델 저장 실패:", e, e2)

print("Saved LSTM model:", lstm_saved)

# 3) Save parameters needed for deployment / reproduction (thresholds etc.).
#    Each entry is None when the corresponding global has not been defined,
#    e.g. because a training cell was skipped.
extra = {
    "boxhed_threshold": float(thr_b) if 'thr_b' in globals() else None,
    "lstm_threshold":  float(thr_l) if 'thr_l' in globals() else None,
    "boxhed_params":   boxhed_params if 'boxhed_params' in globals() else None,
    "MAX_LEN":         int(MAX_LEN) if 'MAX_LEN' in globals() else None,
    "PAD_VALUE":       float(PAD_VALUE) if 'PAD_VALUE' in globals() else None,
}
(Path(OUT_DIR) / "trained_artifacts.json").write_text(json.dumps(extra, ensure_ascii=False, indent=2), encoding="utf-8")
print("Saved trained_artifacts.json")

print("[DONE] Post-training model export")


In [None]:
# =============================
# 3) 결과 비교 요약
# =============================

# One row per model: validation metrics (incl. the selected threshold) and
# test metrics at that same threshold.
boxhed_row = dict(
    model="BoXHED",
    valid_auc=auc_v_b, valid_ap=ap_v_b,
    valid_thr=thr_b, valid_precision=m_v_b["precision"], valid_recall=m_v_b["recall"],
    test_auc=auc_t_b, test_ap=ap_t_b,
    test_precision=m_t_b["precision"], test_recall=m_t_b["recall"],
)
lstm_row = dict(
    model="LSTM",
    valid_auc=auc_v_l, valid_ap=ap_v_l,
    valid_thr=thr_l, valid_precision=m_v_l["precision"], valid_recall=m_v_l["recall"],
    test_auc=auc_t_l, test_ap=ap_t_l,
    test_precision=m_t_l["precision"], test_recall=m_t_l["recall"],
)
summary = pd.DataFrame([boxhed_row, lstm_row])

summary


In [None]:

# =============================
# 4) 누수/정합 빠른 점검
# =============================

def check_disjoint(a, b, name_a, name_b, id_col=ID_COL):
    """Print stay counts for two splits and fail if they share any stay id.

    Raises:
        ValueError: If at least one ``id_col`` value occurs in both frames;
            the message lists up to ten of the shared ids.
    """
    ia, ib = (set(frame[id_col].unique()) for frame in (a, b))
    inter = ia.intersection(ib)
    print(f"{name_a} stays: {len(ia):,} | {name_b} stays: {len(ib):,} | overlap: {len(inter):,}")
    if not inter:
        return
    # Show only a few ids to keep the message readable.
    sample = list(inter)[:10]
    raise ValueError(f"DATA LEAK: {name_a} and {name_b} share stay_id. sample={sample}")

# Splits must not share any stay.
check_disjoint(train_df, valid_df, "train", "valid")
check_disjoint(train_df, test_df,  "train", "test")
check_disjoint(valid_df, test_df,  "valid", "test")

# Label overview.
print("LABEL_COL=", LABEL_COL, "EVENT_COL=", EVENT_COL)
print("Future-label positive rate (train/valid/test):",
      train_df[LABEL_COL].mean().round(4),
      valid_df[LABEL_COL].mean().round(4),
      test_df[LABEL_COL].mean().round(4))

# Post-event removal check.  Dropping every row with t >= first_event_t also
# drops the event rows themselves, so no split may contain an event == 1 row
# afterwards.  The check runs unconditionally on all three splits: guarding
# it with "are there event rows?" would skip it exactly when it passes.
if DROP_AFTER_EVENT:
    for split_name, split_df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
        n_event_rows = int((split_df[EVENT_COL].astype(int) == 1).sum())
        if n_event_rows:
            # Raised explicitly (not via `assert`) so the check survives `python -O`.
            raise AssertionError(
                "DROP_AFTER_EVENT=True인데 event==1 row가 남아있습니다."
                f" (split={split_name}, rows={n_event_rows})"
            )
    print("Post-event rows removed: OK (no event rows remain in filtered splits)")


In [None]:
# === Leakage-Defense Checks (heuristics) ===
# 목적:
# 1) 피처명에 outcome/future/label 등 누수 토큰이 포함된 경우 경고
# 2) 시간 누수 가능성이 큰 "정산형/누적형" 피처(cum/total/elapsed 등) 후보를 목록화
#
# 주의: 이 셀은 "누수 가능성"을 자동으로 찾아주는 휴리스틱입니다.
#      경고가 뜬다고 100% 누수라는 뜻은 아니고, 사람이 확인해야 합니다.

import re
import numpy as np
import pandas as pd

def _g(name):
    """Return the module-level global called ``name``, or None when it is not defined."""
    namespace = globals()
    if name in namespace:
        return namespace[name]
    return None

# Fetch the inputs through _g so this cell degrades to a message (instead of a
# NameError) when the earlier cells have not been run.
train_df = _g('train_df')
FEATURE_COLS = _g('FEATURE_COLS')

if train_df is None or FEATURE_COLS is None:
    print('Leakage-Defense Checks: train_df / FEATURE_COLS 가 아직 정의되지 않았습니다. (위 셀을 먼저 실행하세요)')
else:
    feature_cols = list(FEATURE_COLS)

    # ---- 1) Name-based leakage token scan ----
    # A feature whose name contains one of these tokens may carry outcome /
    # future / label information. It can be legitimate, but a human must check.
    leak_tokens = [
        r'label', r'target', r'outcome', r'event', r'delta',
        r'mort', r'death', r'expire', r'discharge', r'dod',
        # tte / ttd: the class is [ed]; the former [e|d] also matched a literal '|'.
        r'future', r'next', r'horizon', r'\btt[ed]\b',
        r'readmit', r'\blos\b', r'length[_\s]*of[_\s]*stay',
        r'\by\b$', r'\by_', r'_y$',
    ]
    leak_pat = re.compile('|'.join(leak_tokens), flags=re.IGNORECASE)
    name_flagged = [c for c in feature_cols if leak_pat.search(str(c))]

    print('\n[1] Name-based leakage token scan')
    if name_flagged:
        print(f'  ⚠️  Suspicious feature name(s): {len(name_flagged)}')
        for c in name_flagged[:80]:
            print('   -', c)
        if len(name_flagged) > 80:
            print(f'   ...(and {len(name_flagged)-80} more)')
        print('  Action: 해당 피처가 시점 t에서 이용 가능한 정보만으로 계산됐는지 확인하세요.')
    else:
        print('  ✅ No suspicious tokens found in FEATURE_COLS')

    # ---- 2) Accumulator-style feature name scan ----
    # Running totals / elapsed-time style names are listed for manual review.
    acc_tokens = [
        r'cum', r'cumulative', r'total', r'sum', r'count',
        r'elapsed', r'since', r'duration', r'time_since',
        r'ever_', r'history', r'lifetime', r'to_date',
        r'max_ever', r'min_ever',
    ]
    acc_pat = re.compile('|'.join(acc_tokens), flags=re.IGNORECASE)
    acc_flagged = [c for c in feature_cols if acc_pat.search(str(c))]

    print('\n[2] Accumulator-style feature name scan')
    if acc_flagged:
        print(f'  ⚠️  Accumulator-like feature name(s): {len(acc_flagged)}')
        for c in acc_flagged[:80]:
            print('   -', c)
        if len(acc_flagged) > 80:
            print(f'   ...(and {len(acc_flagged)-80} more)')
        print('  Action: 누적/정산 피처가 "t 이전 정보만" 사용해 계산됐는지 확인하세요.')
    else:
        print('  ✅ No accumulator-like tokens found in FEATURE_COLS')

    # ---- 3) Monotonicity heuristic within stay (numeric only) ----
    print('\n[3] Monotonicity heuristic within stay (numeric only)')
    if 'stay_id' not in train_df.columns or 't' not in train_df.columns:
        print('  (skip) stay_id / t 컬럼이 없어 단조성 점검을 생략합니다.')
    else:
        present_cols = [c for c in feature_cols if c in train_df.columns]
        wanted_cols = ['stay_id', 't'] + present_cols

        # Sample stays for speed. Sampling happens before slicing so the full
        # training frame is never copied, and a fixed local seed keeps the
        # report reproducible without touching NumPy's global RNG state.
        uniq = train_df['stay_id'].unique()
        if len(uniq) > 2000:
            rng = np.random.default_rng(0)
            sampled = rng.choice(uniq, size=2000, replace=False)
            df_m = train_df.loc[train_df['stay_id'].isin(sampled), wanted_cols]
        else:
            df_m = train_df[wanted_cols]

        num_cols = [c for c in present_cols if pd.api.types.is_numeric_dtype(df_m[c])]
        if not num_cols:
            print('  (skip) numeric feature column이 없어 단조성 점검을 생략합니다.')
        else:
            # sort_values returns a new frame, so train_df itself is untouched.
            df_m = df_m.sort_values(['stay_id','t'])

            def _monotone_score(values):
                """Share of consecutive steps that move in the dominant direction.

                ``values`` is one stay's time-ordered Series for one feature.
                NaNs are dropped first; fewer than three remaining points
                yields NaN. 1.0 means the series never reverses direction.
                """
                x = values.to_numpy(dtype=float)
                x = x[~np.isnan(x)]
                if len(x) < 3:
                    return np.nan
                dx = np.diff(x)
                inc = np.mean(dx >= -1e-8)
                dec = np.mean(dx <=  1e-8)
                return float(max(inc, dec))

            eval_cols = num_cols[:300]  # cap for speed
            gb = df_m.groupby('stay_id', sort=False)
            # Column-wise SeriesGroupBy.apply hands each group a single Series
            # rather than a full sub-frame for every (stay, column) pair.
            rows = [
                (col, float(gb[col].apply(_monotone_score).mean()))
                for col in eval_cols
            ]

            mono = pd.DataFrame(rows, columns=['feature','mean_monotone_score']).sort_values('mean_monotone_score', ascending=False)
            strong = mono[mono['mean_monotone_score'] >= 0.98]
            if strong.empty:
                print('  ✅ No strong monotone candidates found (threshold: 0.98)')
                print('  Note: 이건 휴리스틱이라, 누수가 없다는 의미는 아닙니다.')
            else:
                print(f'  ⚠️  Strong monotone candidates (>=0.98): {len(strong)}')
                # display() only exists in notebook front ends; fall back to text.
                try:
                    display(strong.head(30))
                except Exception:
                    print(strong.head(30).to_string(index=False))
                print('  Action: 해당 피처가 누적/경과시간/정산형이면, 계산에 미래값이 섞이지 않았는지 확인하세요.')


In [None]:
# =============================
# LSTM helpers for Horizon sweep (self-contained)
# - Cell 14에서 NameError 없이 동작하도록 함수 형태로 제공
# =============================

from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models

# Fill value for padded timesteps: _pad_3d writes it into the padded feature
# tensor and build_lstm_timewise uses it as the default Masking mask_value.
PAD_VALUE = -999.0

def _make_sequences(df_long, feature_cols, id_col, time_col, label_col):
    """Split a long-format frame into one time-ordered sequence per id.

    Args:
        df_long: Frame with one row per (id, time).
        feature_cols: Columns that become the float32 feature matrix.
        id_col: Grouping column; one sequence is produced per distinct value.
        time_col: Column used to order rows inside each group.
        label_col: Column that becomes the float32 label vector.

    Returns:
        ``(df_sorted, X_list, y_list, idx_list)`` where ``df_sorted`` is a copy
        of ``df_long`` sorted by ``[id_col, time_col]`` and, per group in that
        order, ``X_list`` holds ``(len, n_features)`` arrays, ``y_list`` holds
        ``(len, 1)`` arrays and ``idx_list`` holds the original index labels.
    """
    df_sorted = df_long.sort_values([id_col, time_col]).copy()

    # Lazily produce one (features, labels, index) triple per group.
    per_group = (
        (
            grp[feature_cols].to_numpy(dtype=np.float32),
            grp[label_col].to_numpy(dtype=np.float32).reshape(-1, 1),
            grp.index.to_numpy(),
        )
        for _, grp in df_sorted.groupby(id_col, sort=False)
    )
    columns = list(zip(*per_group))
    if not columns:
        # No groups at all: hand back three empty lists.
        return df_sorted, [], [], []

    X_list, y_list, idx_list = (list(col) for col in columns)
    return df_sorted, X_list, y_list, idx_list

def _pad_3d(X_list, max_len, pad_value=PAD_VALUE):
    """Stack variable-length feature sequences into one padded float32 tensor.

    Args:
        X_list: Non-empty list of ``(len_i, n_features)`` arrays.
        max_len: Number of timesteps in the output; longer sequences are cut
            to their first ``max_len`` rows.
        pad_value: Value written into timesteps that hold no data.

    Returns:
        ``(X_pad, mask)``: ``X_pad`` has shape ``(n_seq, max_len, n_features)``
        and ``mask`` is a boolean ``(n_seq, max_len)`` array that is True on
        timesteps copied from the input.
    """
    n_features = X_list[0].shape[1]
    # Number of rows actually kept from every sequence.
    kept = np.minimum([seq.shape[0] for seq in X_list], max_len)

    X_pad = np.full((len(X_list), max_len, n_features), pad_value, dtype=np.float32)
    for row, (seq, n_keep) in enumerate(zip(X_list, kept)):
        X_pad[row, :n_keep, :] = seq[:n_keep]

    # Timestep j of sequence i is real data exactly when j < kept[i].
    mask = np.arange(max_len)[None, :] < kept[:, None]
    return X_pad, mask

def _pad_y(y_list, max_len):
    """Stack variable-length label sequences into a zero-padded float32 tensor.

    Args:
        y_list: List of ``(len_i, 1)`` label arrays.
        max_len: Number of timesteps in the output; longer sequences are cut
            to their first ``max_len`` rows.

    Returns:
        Array of shape ``(len(y_list), max_len, 1)``; padded timesteps are 0.
    """
    y_pad = np.zeros((len(y_list), max_len, 1), dtype=np.float32)
    # Iterating y_pad yields views, so writing into ``slot`` fills y_pad itself.
    for slot, labels in zip(y_pad, y_list):
        n_keep = min(labels.shape[0], max_len)
        slot[:n_keep, :] = labels[:n_keep]
    return y_pad

def build_lstm_timewise(input_shape, pad_value=PAD_VALUE):
    """Build and compile an LSTM that emits one sigmoid score per timestep.

    Layers, in order: Masking(pad_value) -> LSTM(64) -> Dropout(0.2) ->
    LSTM(32) -> TimeDistributed(Dense(1, sigmoid)); both LSTMs return full
    sequences. Compiled with Adam(1e-3), binary cross-entropy and an AUC
    metric named ``auc``.

    Args:
        input_shape: Shape of one sample, passed to ``layers.Input``.
        pad_value: Value the Masking layer treats as padding.

    Returns:
        The compiled Keras model.
    """
    inputs = layers.Input(shape=input_shape)
    stack = [
        layers.Masking(mask_value=pad_value),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32, return_sequences=True),
        layers.TimeDistributed(layers.Dense(1, activation='sigmoid')),
    ]
    hidden = inputs
    for layer in stack:
        hidden = layer(hidden)

    net = models.Model(inputs, hidden)
    net.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )
    return net

def train_lstm_row_model(train_df, valid_df, feature_cols, id_col, time_col, label_col, max_len=120):
    """Fit a StandardScaler on the training rows and train the timewise LSTM.

    Both splits are scaled with the train-fitted scaler, grouped into per-id
    sequences, then padded or truncated to ``max_len`` timesteps. Per-timestep
    sample weights are 1.0 on real timesteps and 0.0 on padded ones. Training
    runs for up to 30 epochs (batch size 256) with early stopping on
    ``val_auc`` (patience 5, best weights restored).

    Args:
        train_df, valid_df: Long-format frames; neither is modified.
        feature_cols: Feature column names.
        id_col, time_col, label_col: Grouping, ordering and label columns.
        max_len: Sequence length fed to the network.

    Returns:
        ``(model, scaler)``: the trained Keras model and the fitted scaler.
    """
    scaler = StandardScaler()
    scaler.fit(train_df[feature_cols].to_numpy(dtype=np.float32))

    def _to_padded(split_df):
        # Scale a private copy so the caller's frame is left untouched.
        scaled = split_df.copy()
        scaled[feature_cols] = scaler.transform(scaled[feature_cols].to_numpy(dtype=np.float32))
        _, X_list, y_list, _ = _make_sequences(scaled, feature_cols, id_col, time_col, label_col)
        X_pad, mask = _pad_3d(X_list, max_len)
        return X_pad, _pad_y(y_list, max_len), mask.astype(np.float32)

    X_tr, y_tr, sw_tr = _to_padded(train_df)
    X_va, y_va, sw_va = _to_padded(valid_df)

    model = build_lstm_timewise((max_len, len(feature_cols)))
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc', mode='max', patience=5, restore_best_weights=True, verbose=0,
    )
    model.fit(
        X_tr, y_tr,
        validation_data=(X_va, y_va, sw_va),
        sample_weight=sw_tr,
        epochs=30, batch_size=256, callbacks=[early_stop], verbose=0,
    )
    return model, scaler

def predict_row_score_lstm(model, scaler, df_eval, feature_cols, id_col, time_col, max_len=120):
    """Score every row of ``df_eval`` with the timewise LSTM.

    The result is aligned with the original row order of ``df_eval``. Scoring
    needs only the feature, id and time columns; no label column (and no
    ``LABEL_COL`` global) is required.

    Args:
        model: Trained model exposing ``predict(X, verbose=0)`` that returns one
            score per (sequence, timestep).
        scaler: Fitted scaler exposing ``transform``.
        df_eval: Long-format frame with a unique index; it is not modified.
        feature_cols: Feature column names, in training order.
        id_col, time_col: Grouping and ordering columns.
        max_len: Sequence length the model was built with.

    Returns:
        float32 array of length ``len(df_eval)``. Rows that fall beyond the
        first ``max_len`` timesteps of their sequence are never scored and stay
        NaN; a RuntimeWarning reports how many NaNs remain.

    Raises:
        AssertionError: If ``df_eval.index`` contains duplicates.
    """
    import warnings

    assert df_eval.index.is_unique, 'df_eval index must be unique for safe mapping'

    row_score = np.full((len(df_eval),), np.nan, dtype=np.float32)
    if len(df_eval) == 0:
        # Nothing to group or pad; _pad_3d cannot handle an empty list.
        return row_score

    df_scaled = df_eval.copy()
    df_scaled[feature_cols] = scaler.transform(df_scaled[feature_cols].to_numpy(dtype=np.float32))

    # Same ordering and grouping as _make_sequences, minus the label column.
    df_sorted = df_scaled.sort_values([id_col, time_col])
    X_list, idx_list = [], []
    for _, grp in df_sorted.groupby(id_col, sort=False):
        X_list.append(grp[feature_cols].to_numpy(dtype=np.float32))
        idx_list.append(grp.index.to_numpy())

    if X_list:
        X_pad, _ = _pad_3d(X_list, max_len)
        yhat = model.predict(X_pad, verbose=0).reshape(len(X_list), max_len)

        # Map index labels back to positions in df_eval with one vectorised lookup.
        kept_labels = np.concatenate([idx[:max_len] for idx in idx_list])
        kept_scores = np.concatenate(
            [yhat[i, :min(len(idx), max_len)] for i, idx in enumerate(idx_list)]
        )
        row_score[df_eval.index.get_indexer(kept_labels)] = kept_scores

    n_missing = int(np.isnan(row_score).sum())
    if n_missing:
        warnings.warn(
            f"predict_row_score_lstm: {n_missing} of {len(row_score)} rows have no score (NaN); "
            f"rows beyond the first max_len={max_len} timesteps of a sequence are not scored.",
            RuntimeWarning,
            stacklevel=2,
        )
    return row_score


In [None]:
# =============================
# 5) Horizon별 성능 곡선 (BoXHED vs LSTM) - 목표 정합 버전
# =============================

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Prediction horizons swept by the loop further down in this cell; each value is
# passed to _prep_eval_dfs as horizon_hours.
HORIZONS_TO_EVAL = [6, 12, 24]  # hours

def _load_raw():
    """Read the train / valid / test CSVs from disk and return them as a 3-tuple."""
    return tuple(pd.read_csv(path) for path in (train_path, valid_path, test_path))

def _prep_eval_dfs(tr_raw, va_raw, te_raw, horizon_hours: int):
    """Turn the raw splits into model-ready frames for one horizon.

    Stages, applied to train / valid / test in that order:
      1. add the future label on a copy of the raw frame,
      2. add the ``_label_observable`` mask and keep only rows where it holds,
      3. drop post-event rows when ``DROP_AFTER_EVENT`` is set,
      4. optionally (``APPLY_CUTOFF_TO_TRAIN``) keep only train rows with
         ``TIME_COL <= CUTOFF_HOURS``.

    Returns:
        ``(train, valid, test)`` frames; the raw inputs are left unmodified.
    """
    # 1) Future label, computed on copies of the raw frames.
    splits = [
        add_future_label(raw.copy(), horizon_hours=horizon_hours)
        for raw in (tr_raw, va_raw, te_raw)
    ]

    # 2) Censoring mask, then drop rows whose label cannot be observed.
    splits = [add_label_observable_mask(df, horizon_hours=horizon_hours) for df in splits]
    splits = [df.loc[df["_label_observable"]].copy() for df in splits]

    # 3) Remove rows at or after the first event.
    if DROP_AFTER_EVENT:
        splits = [drop_rows_after_first_event(df) for df in splits]

    tr, va, te = splits

    # 4) Optional cutoff, applied to the training split only.
    if APPLY_CUTOFF_TO_TRAIN:
        within_cutoff = tr[TIME_COL] <= CUTOFF_HOURS
        tr = tr.loc[within_cutoff].copy()

    return tr, va, te

def _feature_cols_from(df: pd.DataFrame):
    """Return the columns of ``df`` that may be used as model features.

    Excluded: the id / time / label columns, the raw ``event`` and ``delta``
    columns, the weight column when one is configured, the derived indices
    this notebook deliberately leaves out, and the ``_label_observable``
    bookkeeping column.

    Raises:
        RuntimeError: If nothing is left after the exclusions.
    """
    # A fixed set of names is used so this does not depend on optional globals
    # (e.g. DELTA_COL) that may be undefined.
    excluded = {ID_COL, TIME_COL, LABEL_COL, "event", "delta"}
    if WEIGHT_COL is not None:
        excluded.add(WEIGHT_COL)
    # Derived indices are left out on purpose (notebook default).
    excluded |= {"calc_DiasBP", "ShockIndex", "PulsePressure", "ModShockIndex", "ROX_Index"}
    # _prep_eval_dfs adds this mask and filters on it, so it is still present in
    # the frames handed to this function; it is label bookkeeping, not a feature.
    excluded.add("_label_observable")

    cols = [c for c in df.columns if c not in excluded]
    if not cols:
        raise RuntimeError("No feature columns after dropping.")
    return cols

def _run_boxhed_one(tr_eval, va_eval, te_eval, feature_cols):
    """Fit BoXHED on the training frame and score the valid and test frames.

    Row weights come from ``WEIGHT_COL`` when it is configured and present in
    ``tr_eval`` (otherwise None) and are passed through ``apply_class_weight``
    together with the labels before fitting.

    Returns:
        ``(valid_row_scores, test_row_scores)`` as produced by
        ``predict_row_score_boxhed_df``.
    """
    features = tr_eval[feature_cols].to_numpy(dtype=np.float32)
    labels = tr_eval[LABEL_COL].to_numpy(dtype=np.int64).reshape(-1)

    has_weights = WEIGHT_COL is not None and WEIGHT_COL in tr_eval.columns
    base_weights = (
        tr_eval[WEIGHT_COL].to_numpy(dtype=np.float32).reshape(-1) if has_weights else None
    )
    fit_weights = apply_class_weight(base_weights, labels)

    booster = boxhed(
        gpu_id=-1, nthread=8,
        max_depth=3, n_estimators=200, eta=0.05, min_child_events=1,
    )
    booster.fit(features, labels, fit_weights)

    valid_scores = predict_row_score_boxhed_df(booster, va_eval, feature_cols)
    test_scores = predict_row_score_boxhed_df(booster, te_eval, feature_cols)
    return valid_scores, test_scores

def _run_lstm_one(tr_eval, va_eval, te_eval, feature_cols):
    """Train the row-level LSTM and score the valid and test frames.

    Uses the notebook-wide ``ID_COL``, ``TIME_COL``, ``LABEL_COL`` and
    ``MAX_LEN`` settings.

    Returns:
        ``(valid_row_scores, test_row_scores)``.
    """
    # Keyword arguments shared by the training and the scoring helper.
    shared = dict(feature_cols=feature_cols, id_col=ID_COL, time_col=TIME_COL, max_len=MAX_LEN)

    net, fitted_scaler = train_lstm_row_model(
        train_df=tr_eval, valid_df=va_eval, label_col=LABEL_COL, **shared
    )
    valid_scores = predict_row_score_lstm(net, fitted_scaler, va_eval, **shared)
    test_scores = predict_row_score_lstm(net, fitted_scaler, te_eval, **shared)
    return valid_scores, test_scores

def _eval_one(df_eval, row_score):
    """Aggregate row scores to stay level with the notebook-wide settings.

    Thin wrapper around ``stay_score_with_cutoff`` using ``CUTOFF_HOURS`` and
    ``AGG_MODE``; its result is returned unchanged (callers in this cell
    unpack it into two values).
    """
    stay_level = stay_score_with_cutoff(
        df_eval, row_score, cutoff_hours=CUTOFF_HOURS, agg=AGG_MODE
    )
    return stay_level

rows = []

# The raw CSVs do not depend on the horizon and _prep_eval_dfs works on copies
# of them, so they are read once here instead of once per horizon.
tr_raw, va_raw, te_raw = _load_raw()

for H in HORIZONS_TO_EVAL:
    print(f"===== Horizon {H}h =====")
    tr_eval, va_eval, te_eval = _prep_eval_dfs(tr_raw, va_raw, te_raw, H)

    # Split-leak check: no stay may appear in two splits.
    check_disjoint(tr_eval, va_eval, "train", "valid")
    check_disjoint(tr_eval, te_eval, "train", "test")
    check_disjoint(va_eval, te_eval, "valid", "test")

    feature_cols = _feature_cols_from(tr_eval)

    # --- BoXHED ---
    v_row_b, t_row_b = _run_boxhed_one(tr_eval, va_eval, te_eval, feature_cols)
    yv_b, sv_b = _eval_one(va_eval, v_row_b)
    yt_b, st_b = _eval_one(te_eval, t_row_b)
    # The threshold is chosen on validation and reused unchanged on test.
    thr_b, _, _ = pick_threshold_by_recall_then_precision(yv_b, sv_b, target_recall=TARGET_RECALL)
    auc_v_b, ap_v_b = safe_auc_ap(yv_b, sv_b)
    auc_t_b, ap_t_b = safe_auc_ap(yt_b, st_b)
    mv_b = metrics_at_threshold(yv_b, sv_b, thr_b)
    mt_b = metrics_at_threshold(yt_b, st_b, thr_b)

    rows.append(dict(model="BoXHED", horizon_hours=H,
                     valid_auc=auc_v_b, valid_ap=ap_v_b, valid_precision=mv_b["precision"], valid_recall=mv_b["recall"],
                     test_auc=auc_t_b,  test_ap=ap_t_b,  test_precision=mt_b["precision"],  test_recall=mt_b["recall"]))

    # --- LSTM ---
    v_row_l, t_row_l = _run_lstm_one(tr_eval, va_eval, te_eval, feature_cols)
    yv_l, sv_l = _eval_one(va_eval, v_row_l)
    yt_l, st_l = _eval_one(te_eval, t_row_l)
    # Same validation-only threshold selection as for BoXHED.
    thr_l, _, _ = pick_threshold_by_recall_then_precision(yv_l, sv_l, target_recall=TARGET_RECALL)
    auc_v_l, ap_v_l = safe_auc_ap(yv_l, sv_l)
    auc_t_l, ap_t_l = safe_auc_ap(yt_l, st_l)
    mv_l = metrics_at_threshold(yv_l, sv_l, thr_l)
    mt_l = metrics_at_threshold(yt_l, st_l, thr_l)

    rows.append(dict(model="LSTM", horizon_hours=H,
                     valid_auc=auc_v_l, valid_ap=ap_v_l, valid_precision=mv_l["precision"], valid_recall=mv_l["recall"],
                     test_auc=auc_t_l,  test_ap=ap_t_l,  test_precision=mt_l["precision"],  test_recall=mt_l["recall"]))

# One row per (model, horizon).
df_h = pd.DataFrame(rows)
display(df_h)

def _plot(metric, split="test"):
    """Plot ``{split}_{metric}`` from ``df_h`` against horizon, one line per model."""
    column = f"{split}_{metric}"
    plt.figure()
    for model_name in ("BoXHED", "LSTM"):
        is_model = df_h["model"] == model_name
        curve = df_h.loc[is_model].sort_values("horizon_hours")
        plt.plot(curve["horizon_hours"], curve[column], marker="o", label=model_name)
    plt.title(f"{split}.{metric} vs Horizon")
    plt.xlabel("Horizon (hours)")
    plt.ylabel(f"{split}.{metric}")
    plt.grid(True)
    plt.legend()
    plt.show()

# Draw one test-split figure per metric (the test_auc, test_ap, test_precision
# and test_recall columns of df_h).
for m in ["auc", "ap", "precision", "recall"]:
    _plot(m, split="test")


## Time-dependent evaluation (row-level): AUC/AP by time t

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _safe_auc_ap(y, s):
    """Return ``(auc, ap)``, or ``(nan, nan)`` when ``y`` holds fewer than two classes.

    Labels are cast to int and scores to float before the check; when both
    classes are present the work is delegated to the notebook's existing
    ``safe_auc_ap`` helper.
    """
    labels = np.asarray(y).astype(int)
    scores = np.asarray(s).astype(float)
    has_both_classes = len(np.unique(labels)) >= 2
    if has_both_classes:
        return safe_auc_ap(labels, scores)  # existing notebook helper
    return np.nan, np.nan

def time_slice_metrics(df_long, row_score, time_col=TIME_COL, label_col=LABEL_COL,
                       cutoff_hours=CUTOFF_HOURS, time_mode="exact_int"):
    """Compute row-level AUC / AP separately for every integer time bin.

    Args:
        df_long: Long-format frame; it is copied, not modified.
        row_score: One score per row of ``df_long``, in the same row order.
        time_col, label_col: Time and label columns of ``df_long``.
        cutoff_hours: Last bin (inclusive); bins run from 0 to this value.
        time_mode: ``"exact_int"`` casts the time column to int,
            ``"bin_floor"`` floors it first.

    Returns:
        DataFrame with one row per bin and the columns ``t``, ``n`` (rows),
        ``pos`` (rows with label 1), ``auc``, ``ap`` and ``pos_rate``
        (``pos / n``, NaN for empty bins). Empty bins get NaN metrics.

    Raises:
        ValueError: If ``time_mode`` is not one of the two supported values.
    """
    scored = df_long.copy()
    scored["_row_score"] = np.asarray(row_score, dtype=np.float32)

    binners = {
        "exact_int": lambda col: col.astype(int),
        "bin_floor": lambda col: np.floor(col.to_numpy(dtype=float)).astype(int),
    }
    if time_mode not in binners:
        raise ValueError("time_mode must be one of {'exact_int','bin_floor'}")
    scored["_tbin"] = binners[time_mode](scored[time_col])

    # Keep only the bins inside [0, cutoff_hours].
    in_window = (scored["_tbin"] >= 0) & (scored["_tbin"] <= cutoff_hours)
    scored = scored.loc[in_window].copy()

    def _one_bin(hour):
        at_hour = scored.loc[scored["_tbin"] == hour]
        if len(at_hour) == 0:
            return dict(t=hour, n=0, pos=0, auc=np.nan, ap=np.nan)
        labels = at_hour[label_col].to_numpy(dtype=int)
        scores = at_hour["_row_score"].to_numpy(dtype=float)
        auc, ap = _safe_auc_ap(labels, scores)
        return dict(t=hour, n=len(at_hour), pos=int((labels == 1).sum()), auc=auc, ap=ap)

    out = pd.DataFrame([_one_bin(hour) for hour in range(0, cutoff_hours + 1)])
    # Empty bins have n == 0; map that to NaN so the division yields NaN.
    out["pos_rate"] = out["pos"] / out["n"].replace(0, np.nan)
    return out

def plot_compare(df_time, metric, split_name="TEST"):
    """Plot ``metric`` over time ``t`` from ``df_time``, one line per model."""
    plt.figure()
    for model_name in ("BoXHED", "LSTM"):
        is_model = df_time["model"] == model_name
        curve = df_time.loc[is_model].sort_values("t")
        plt.plot(curve["t"], curve[metric], marker="o", label=model_name)
    plt.title(f"{split_name} time-dependent {metric}")
    plt.xlabel("time t (hours)")
    plt.ylabel(metric)
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
# ====== Time-dependent evaluation (row-level) ======
# BoXHED row scores (produced by an earlier cell).
v_b = valid_row_boxhed
t_b = test_row_boxhed

# LSTM row scores: if the notebook uses different variable names, change only
# the two lines below (usually valid_row_lstm / test_row_lstm).
v_l = valid_row_lstm
t_l = test_row_lstm

# Use "exact_int" when TIME_COL holds integer hours, "bin_floor" when it is
# fractional or irregular.
TIME_MODE = "exact_int"

# NOTE: the per-model frames are intentionally not bound to m_v_b / m_v_l /
# m_t_b / m_t_l. The summary cell keeps threshold-metric dicts under m_v_l and
# m_t_l (it reads m_v_l["precision"]); rebinding those names to DataFrames here
# would break that cell when it is re-run afterwards.

# ---- VALID ----
df_time_valid = pd.concat(
    [
        time_slice_metrics(valid_df, v_b, cutoff_hours=CUTOFF_HOURS, time_mode=TIME_MODE).assign(model="BoXHED"),
        time_slice_metrics(valid_df, v_l, cutoff_hours=CUTOFF_HOURS, time_mode=TIME_MODE).assign(model="LSTM"),
    ],
    axis=0,
    ignore_index=True,
)
display(df_time_valid)

# ---- TEST ----
df_time_test = pd.concat(
    [
        time_slice_metrics(test_df, t_b, cutoff_hours=CUTOFF_HOURS, time_mode=TIME_MODE).assign(model="BoXHED"),
        time_slice_metrics(test_df, t_l, cutoff_hours=CUTOFF_HOURS, time_mode=TIME_MODE).assign(model="LSTM"),
    ],
    axis=0,
    ignore_index=True,
)
display(df_time_test)

for metric in ["auc", "ap", "pos_rate"]:
    plot_compare(df_time_test, metric, split_name="TEST")
