## 데이터 로드 및 기본 데이터셋 구성

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

# ====== Data loading ======
# Candidate CSV locations for each split. resolve_path() returns the first
# entry that exists on disk, so list order is the lookup priority.
train_candidates = [
    Path("../data/29757_train_merged.csv"),
    Path("../data/1000_train_merged.csv"),
    Path("../data/10000_train_merged.csv"),
]
test_candidates = [
    Path("../data/29757_test_merged.csv"),
    Path("../data/1000_test_merged.csv"),
    Path("../data/10000_test_merged.csv"),
]

def resolve_path(candidates: list[Path]) -> Path:
    """Return the first candidate path that exists on disk.

    Candidates are probed in the order given, so earlier entries win.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"No dataset found in: {candidates}")
    return found

# Load the first available train/test CSV and report the (rows, columns) shapes.
train_df = pd.read_csv(resolve_path(train_candidates))
test_df = pd.read_csv(resolve_path(test_candidates))
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")




: 

## 데이터 전처리 함수

In [None]:
# ====== Basic configuration ======
target_col = "died_in_icu"
possible_group_cols = ["patientunitstayid", "patient_id"]
possible_time_cols = ["observationoffset"]

# Use the first candidate column name that is actually present in train_df.
group_col = next((c for c in possible_group_cols if c in train_df.columns), None)
time_col = next((c for c in possible_time_cols if c in train_df.columns), None)
if group_col is None or time_col is None:
    raise ValueError("patient id or time column not found")

# Base features = every numeric column except the target, id/time columns and
# two explicitly excluded features.
# NOTE(review): why feature35/feature36 are excluded is not visible here — confirm.
numeric_cols = train_df.select_dtypes(include=["number"]).columns
exclude = {target_col, "patient_id", "patientunitstayid", "observationoffset", "feature35", "feature36"}
base_cols = [c for c in numeric_cols if c not in exclude]

# ====== 함수 ======
def add_missing_and_time_features(
    df: pd.DataFrame, base_cols: list[str], add_tsince: bool = False
) -> pd.DataFrame:
    """Add per-column missingness indicators (and optionally time-since-last-value).

    Relies on the module-level ``group_col`` and ``time_col`` names.

    Args:
        df: Input frame; it is not modified (a sorted copy is returned).
        base_cols: Columns for which derived features are created.
        add_tsince: When True, also add ``<col>_tsince``: ``time_col`` minus the
            ``time_col`` of the most recent non-missing value of ``<col>`` within
            the same group. Rows with no earlier observation in the group get the
            raw ``time_col`` value. Defaults to False, which matches the previous
            behavior (only ``<col>_miss`` is added) without paying for the
            per-column forward-fill that was previously computed and discarded.

    Returns:
        A copy of ``df`` sorted by ``[group_col, time_col]`` with the new columns.
    """
    df = df.sort_values([group_col, time_col]).copy()
    for col in base_cols:
        # 1 where the value is missing, 0 otherwise.
        df[f"{col}_miss"] = df[col].isna().astype(int)
        if add_tsince:
            # Timestamp of the latest observed value, carried forward per group.
            last_time = df[time_col].where(df[col].notna()).groupby(df[group_col]).ffill()
            tsince = df[time_col] - last_time
            df[f"{col}_tsince"] = tsince.fillna(df[time_col])
    return df

def impute_base(df: pd.DataFrame, base_cols: list[str], medians: pd.Series) -> pd.DataFrame:
    """Impute ``base_cols``: forward-fill within each group, then fill with ``medians``.

    Uses the module-level ``group_col`` and ``time_col``. The input frame is left
    untouched; a copy sorted by ``[group_col, time_col]`` is returned.
    """
    ordered = df.sort_values(by=[group_col, time_col]).copy()
    # Step 1: carry the last observed value forward inside each group.
    carried_forward = ordered.groupby(group_col, sort=False)[base_cols].ffill()
    ordered[base_cols] = carried_forward
    # Step 2: whatever is still missing gets the supplied per-column median.
    still_sparse = ordered[base_cols]
    ordered[base_cols] = still_sparse.fillna(medians)
    return ordered

def sample_k_per_stay(df, group_col, time_col, k=10):
    """Keep at most ``k`` evenly spaced rows per group, in time order.

    Groups with ``k`` rows or fewer are kept whole. Larger groups keep the rows
    at positions ``np.linspace(0, n - 1, k, dtype=int)`` of the time-sorted
    group, so the first and last rows are always retained (for ``k >= 2``).

    Rows are selected by position instead of ``groupby(...).apply(...)``: the
    apply-based form operates on the grouping column, which pandas 2.2
    deprecates, and it is slower because it builds one sub-frame per group.

    Args:
        df: Input frame (not modified).
        group_col: Column identifying the group (e.g. a stay id).
        time_col: Column used to order rows inside a group.
        k: Maximum number of rows to keep per group.

    Returns:
        A frame sorted by ``[group_col, time_col]`` containing the kept rows,
        with the original index labels and all original columns. Rows whose
        group key is missing are dropped (same as the default groupby behavior).
    """
    df = df.sort_values([group_col, time_col]).copy()
    if df.empty:
        return df

    # Group positional row numbers by the key values; iteration yields groups in
    # sorted key order with positions ascending inside each group.
    positions = pd.Series(np.arange(len(df)))
    keep = []
    for _, group_pos in positions.groupby(df[group_col].to_numpy()):
        rows = group_pos.to_numpy()
        n = len(rows)
        if n > k:
            rows = rows[np.linspace(0, n - 1, k, dtype=int)]
        keep.append(rows)

    if not keep:
        # Every group key was missing, so nothing survives.
        return df.iloc[0:0]
    return df.iloc[np.concatenate(keep)]

def balance_ratio(df, target_col="died_in_icu", ratio_pos=0.08, random_state=42):
    """Down-sample one class so positives make up about ``ratio_pos`` of the rows.

    Rows with ``target_col == 1`` are positives and ``target_col == 0`` are
    negatives; rows with any other value are dropped. If there are more
    negatives than the ratio allows, negatives are down-sampled; otherwise
    positives are down-sampled. Nothing is over-sampled.

    Note: when ``df`` has no positive rows the target ratio can only be met by
    keeping zero negatives, so the result is empty.

    Args:
        df: Input frame (not modified).
        target_col: Name of the binary label column.
        ratio_pos: Desired share of positives; must be strictly between 0 and 1.
        random_state: Seed used for sampling and for the final shuffle.

    Returns:
        The balanced rows, shuffled, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``ratio_pos`` is not strictly between 0 and 1 (previously
            this surfaced as a ZeroDivisionError or an obscure sampling error).
    """
    if not 0 < ratio_pos < 1:
        raise ValueError(f"ratio_pos must be strictly between 0 and 1, got {ratio_pos!r}")

    pos = df[df[target_col] == 1]
    neg = df[df[target_col] == 0]
    desired_neg = int(len(pos) * (1 - ratio_pos) / ratio_pos)

    if desired_neg >= len(neg):
        # Not enough negatives for all positives: trim positives instead.
        desired_pos = int(len(neg) * ratio_pos / (1 - ratio_pos))
        # Clamp defensively so float rounding can never request more rows than exist.
        pos = pos.sample(n=min(desired_pos, len(pos)), random_state=random_state)
    else:
        neg = neg.sample(n=desired_neg, random_state=random_state)

    return pd.concat([pos, neg]).sample(frac=1, random_state=random_state).reset_index(drop=True)

# ====== 나이 필터(옵션) ======
def apply_age_filter(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    age_col: str = "age",   # 실제 컬럼명으로 수정
    min_age: int | None = None,
    max_age: int | None = None,
    enabled: bool = True,
):
    if not enabled or age_col not in train_df.columns or age_col not in test_df.columns:
        return train_df, test_df
    t = train_df
    s = test_df
    if min_age is not None:
        t = t[t[age_col] >= min_age]
        s = s[s[age_col] >= min_age]
    if max_age is not None:
        t = t[t[age_col] <= max_age]
        s = s[s[age_col] <= max_age]
    return t, s

def sample_total_rows_by_group(df, group_col, n_total=1000, random_state=42):
    """Randomly pick up to ``n_total`` rows, spreading picks across groups.

    Groups are visited round-robin in a random order and each visit takes one
    not-yet-picked random row from the group, so every group contributes a row
    before any group contributes a second one. Sampling stops at ``n_total``
    rows or when all rows are used.

    Rows are tracked by position rather than by index label, so the result is
    correct even when ``df`` has duplicate index labels (label-based ``.loc``
    would return every row sharing a label). Position arrays are copied before
    the in-place shuffle so no pandas-owned buffer is ever mutated.

    Args:
        df: Input frame (not modified).
        group_col: Column identifying the group; rows with a missing key are
            never picked (default groupby behavior).
        n_total: Maximum number of rows to return.
        random_state: Seed for the group order, the within-group order and the
            final shuffle.

    Returns:
        The picked rows, shuffled, with a fresh 0..n-1 index.
    """
    rng = np.random.default_rng(random_state)

    # Positional row numbers per group, in sorted key order.
    positions = pd.Series(np.arange(len(df)))
    remaining = {
        gid: group_pos.to_numpy(copy=True)
        for gid, group_pos in positions.groupby(df[group_col].to_numpy())
    }
    group_ids = list(remaining)
    rng.shuffle(group_ids)
    for gid in group_ids:
        rng.shuffle(remaining[gid])

    picks = []
    while len(picks) < n_total and group_ids:
        next_group_ids = []
        for gid in group_ids:
            if len(picks) >= n_total:
                break
            rows = remaining[gid]
            picks.append(rows[0])
            remaining[gid] = rows[1:]
            # Only groups that still have rows take part in the next round.
            if len(remaining[gid]) > 0:
                next_group_ids.append(gid)
        group_ids = next_group_ids

    result = df.iloc[picks].sample(frac=1, random_state=random_state).reset_index(drop=True)
    return result

## 최종 데이터셋 구성

In [None]:
# ====== Features used ======

feature_cols = base_cols


# ====== Apply age filter (optional) ======
# NOTE: this rebinds train_df / test_df, so every later cell sees the
# age-filtered frames rather than the frames as loaded from CSV.
train_df, test_df = apply_age_filter(train_df, test_df, age_col="feature1", min_age=18, max_age=60, enabled=True)
print(f"Filtered Data shape: {train_df.shape}, Test shape: {test_df.shape}")

# ====== Pipeline: sample 1000 rows round-robin across groups -> impute missing values ======
train_proc = sample_total_rows_by_group(train_df, group_col, n_total=1000, random_state=42)
test_proc = sample_total_rows_by_group(test_df, group_col, n_total=1000, random_state=42)

# ====== Pipeline: sample 10 rows per stay -> impute missing values ======
#train_proc = sample_k_per_stay(train_df, group_col, time_col, k=10)
#test_proc = sample_k_per_stay(test_df, group_col, time_col, k=10)


# ===== Create and add derived features ======
# NOTE(review): if re-enabled, the second line passes test_df (the unsampled
# frame) instead of test_proc, which would undo the sampling above — confirm.

# train_proc = add_missing_and_time_features(train_proc, base_cols)
# test_proc = add_missing_and_time_features(test_df, base_cols)

# ===== Missing-value handling =====
# Medians come from the train split only and are reused for the test split.
train_medians = train_proc[base_cols].median()
train_proc = impute_base(train_proc, base_cols, train_medians)
test_proc = impute_base(test_proc, base_cols, train_medians)

# ====== Match class ratio (train only) ======
train_proc = balance_ratio(train_proc, target_col=target_col, ratio_pos=0.08)
#test_proc = balance_ratio(test_proc, target_col=target_col, ratio_pos=0.08)

# ====== Build datasets ======
X_train = train_proc[feature_cols]
y_train = train_proc[target_col] 

X_test = test_proc[feature_cols]
y_test = test_proc[target_col]


: 

In [None]:
import matplotlib.pyplot as plt

target_col = "died_in_icu"

# Frames whose class ratios are compared side by side.
datasets = {
    "train_proc": train_proc,
    "test_proc": test_proc,
}

fig, axes = plt.subplots(1, 2, figsize=(10, 6), sharey=True)

for ax, (name, df) in zip(axes, datasets.items()):
    # Share of each class value, ordered by class label.
    counts = df[target_col].value_counts().sort_index()
    ratios = counts / counts.sum()

    ax.bar(ratios.index.astype(str), ratios.values, color=["tab:blue", "tab:red"])
    ax.set_xlabel(target_col)
    ax.set_title(f"Class Ratio in {name}")

    # Print the percentage just above each bar.
    for i, v in enumerate(ratios.values):
        ax.text(i, v + 0.01, f"{v:.2%}", ha="center")

axes[0].set_ylabel("Ratio")
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn theme for the plots drawn after this point.
sns.set_theme(style="whitegrid", context="talk")

def snapshot(df, step, split, group_col, target_col, base_cols):
    """Summarize one pipeline stage as a flat dict (one row of the summary table).

    Returns:
        Dict with the stage labels (``step``, ``split``), the row count, the
        number of distinct ``group_col`` values, the mean of ``target_col`` and
        the missing rate of ``base_cols`` (mean of the per-column missing rates).
    """
    n_rows = len(df)
    n_stays = df[group_col].nunique()
    positive_share = df[target_col].mean()
    per_column_missing = df[base_cols].isna().mean()
    summary = {"step": step, "split": split}
    summary["rows"] = n_rows
    summary["stays"] = n_stays
    summary["pos_rate"] = positive_share
    summary["missing_rate"] = per_column_missing.mean()
    return summary

def plot_overview(summary_df, title="Preprocessing Overview"):
    """Draw a 2x2 grid of bar charts (one per summary metric) by step and split."""
    # (grid position, summary_df column, panel title, y-axis label)
    panels = [
        ((0, 0), "rows", "Rows by Step", "rows"),
        ((0, 1), "stays", "Unique Stays by Step", "unique stays"),
        ((1, 0), "missing_rate", "Avg Missing Rate (base_cols)", "missing rate"),
        ((1, 1), "pos_rate", "Positive Rate", "pos rate"),
    ]
    _, grid = plt.subplots(2, 2, figsize=(14, 9))
    for (row, col), metric, heading, y_label in panels:
        panel = grid[row, col]
        sns.barplot(data=summary_df, x="step", y=metric, hue="split", ax=panel)
        panel.set_title(heading)
        panel.set_ylabel(y_label)

    # Shared cosmetics: tilt the step labels and place the legend automatically.
    for panel in grid.ravel():
        panel.tick_params(axis="x", rotation=20)
        panel.legend(loc="best")
    plt.suptitle(title, y=1.02, fontsize=16)
    plt.tight_layout()
    plt.show()

def plot_age_filter(train_before, train_after, test_before, test_after, age_col):
    """Overlay before/after density histograms of ``age_col`` for train and test."""
    _, panels = plt.subplots(1, 2, figsize=(14, 5), sharey=True)
    # (before frame, after frame, color of the "after" histogram, panel title)
    layout = [
        (train_before, train_after, "#2ca02c", "Train Age Distribution"),
        (test_before, test_after, "#1f77b4", "Test Age Distribution"),
    ]
    for panel, (before, after, after_color, heading) in zip(panels, layout):
        sns.histplot(before[age_col], bins=30, color="#999999", label="before", stat="density", ax=panel)
        sns.histplot(after[age_col], bins=30, color=after_color, label="after", stat="density", ax=panel)
        panel.set_title(heading)
        panel.legend()

    plt.tight_layout()
    plt.show()

def plot_sampling_per_stay(before_df, after_df, group_col, title, before_xmax=1300):
    """Plot histograms of rows-per-group before and after sampling.

    Args:
        before_df: Frame before sampling.
        after_df: Frame after sampling.
        group_col: Column identifying the group (stay).
        title: Prefix for both panel titles (e.g. "Train").
        before_xmax: Upper x-axis limit of the "before" panel. Defaults to 1300,
            the value that used to be hard-coded; pass ``None`` to let
            matplotlib choose the range.
    """
    before_counts = before_df.groupby(group_col).size()
    after_counts = after_df.groupby(group_col).size()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    sns.histplot(before_counts, bins=30, color="#999999", ax=axes[0])
    axes[0].set_title(f"{title}: rows per stay (before)")
    axes[0].set_xlabel("rows per stay")
    if before_xmax is not None:
        axes[0].set_xlim(0, before_xmax)
    sns.histplot(after_counts, bins=30, color="#ff7f0e", ax=axes[1])
    axes[1].set_title(f"{title}: rows per stay (after)")
    axes[1].set_xlabel("rows per stay")
    plt.tight_layout()
    plt.show()

def plot_missing_indicator(df, base_cols, title, top_n=12):
    """Bar-plot the ``top_n`` ``<col>_miss`` indicator columns with the highest mean.

    Only indicator columns that actually exist in ``df`` are considered.
    """
    indicator_cols = []
    for base in base_cols:
        indicator = f"{base}_miss"
        if indicator in df.columns:
            indicator_cols.append(indicator)

    ranked = df[indicator_cols].mean().sort_values(ascending=False)
    worst = ranked.head(top_n)

    plt.figure(figsize=(10, 5))
    sns.barplot(x=worst.values, y=worst.index, color="#9467bd")
    plt.title(f"{title}: top missing indicators")
    plt.xlabel("mean missing (1=missing)")
    plt.ylabel("")
    plt.tight_layout()
    plt.show()

def plot_imputation_effect(before_df, after_df, base_cols, title):
    """Compare the average missing rate of ``base_cols`` before and after imputation."""
    stages = ["before", "after"]
    # Mean over columns of each column's missing rate, per stage.
    rates = [frame[base_cols].isna().mean().mean() for frame in (before_df, after_df)]

    plt.figure(figsize=(6, 4))
    sns.barplot(x=stages, y=rates, color="#8c564b")
    plt.title(f"{title}: missing rate before vs after impute")
    plt.ylabel("avg missing rate")
    plt.tight_layout()
    plt.show()

def plot_class_balance(before_df, after_df, target_col, title):
    """Grouped bar chart of class 0/1 counts before and after rebalancing.

    A class that is absent from a frame is plotted with a count of 0.
    """
    counts_by_stage = {
        "before": before_df[target_col].value_counts().sort_index(),
        "after": after_df[target_col].value_counts().sort_index(),
    }
    # Long-format rows in the order: before/0, before/1, after/0, after/1.
    classes, counts, stages = [], [], []
    for stage, stage_counts in counts_by_stage.items():
        for label in (0, 1):
            classes.append(str(label))
            counts.append(stage_counts.get(label, 0))
            stages.append(stage)
    df_plot = pd.DataFrame({"class": classes, "count": counts, "stage": stages})

    plt.figure(figsize=(6, 4))
    sns.barplot(data=df_plot, x="class", y="count", hue="stage")
    plt.title(f"{title}: class balance")
    plt.xlabel(target_col)
    plt.tight_layout()
    plt.show()


## 시각화

In [None]:
# ====== raw snapshot ======
# NOTE(review): if the dataset-building cell above has already run, train_df /
# test_df were rebound to the age-filtered frames there, so this "raw" snapshot
# is already filtered and the age-filter step below changes nothing — confirm
# the intended cell order or reload the CSVs first.
snapshots = []
train_raw = train_df.copy()
test_raw = test_df.copy()
snapshots.append(snapshot(train_raw, "raw", "train", group_col, target_col, base_cols))
snapshots.append(snapshot(test_raw, "raw", "test", group_col, target_col, base_cols))

# ====== Apply age filter ======
train_df, test_df = apply_age_filter(
    train_df, test_df, age_col="feature1", min_age=18, max_age=60, enabled=True
)
plot_age_filter(train_raw, train_df, test_raw, test_df, age_col="feature1")
snapshots.append(snapshot(train_df, "age_filter", "train", group_col, target_col, base_cols))
snapshots.append(snapshot(test_df, "age_filter", "test", group_col, target_col, base_cols))

# ====== Sample 10 rows per stay ======
# NOTE: this rebinds train_proc / test_proc; X_train / X_test built in the
# earlier cell are not rebuilt here.
train_before_sample = train_df.copy()
test_before_sample = test_df.copy()
train_proc = sample_k_per_stay(train_df, group_col, time_col, k=10)
test_proc = sample_k_per_stay(test_df, group_col, time_col, k=10)
plot_sampling_per_stay(train_before_sample, train_proc, group_col, "Train")
plot_sampling_per_stay(test_before_sample, test_proc, group_col, "Test")
snapshots.append(snapshot(train_proc, "sample_k", "train", group_col, target_col, base_cols))
snapshots.append(snapshot(test_proc, "sample_k", "test", group_col, target_col, base_cols))

# ====== Derived features (disabled) ======
# train_proc = add_missing_and_time_features(train_proc, base_cols)
# test_proc = add_missing_and_time_features(test_proc, base_cols)
# plot_missing_indicator(train_proc, base_cols, "Train")
# plot_missing_indicator(test_proc, base_cols, "Test")
# snapshots.append(snapshot(train_proc, "add_missing", "train", group_col, target_col, base_cols))
# snapshots.append(snapshot(test_proc, "add_missing", "test", group_col, target_col, base_cols))

# ====== Missing-value handling ======
# Medians are computed on the train split and reused for the test split.
train_before_impute = train_proc.copy()
test_before_impute = test_proc.copy()
train_medians = train_proc[base_cols].median()
train_proc = impute_base(train_proc, base_cols, train_medians)
test_proc = impute_base(test_proc, base_cols, train_medians)
plot_imputation_effect(train_before_impute, train_proc, base_cols, "Train")
plot_imputation_effect(test_before_impute, test_proc, base_cols, "Test")
snapshots.append(snapshot(train_proc, "impute", "train", group_col, target_col, base_cols))
snapshots.append(snapshot(test_proc, "impute", "test", group_col, target_col, base_cols))

# ====== Match class ratio (train only) ======
train_before_balance = train_proc.copy()
train_proc = balance_ratio(train_proc, target_col=target_col, ratio_pos=0.08)
plot_class_balance(train_before_balance, train_proc, target_col, "Train")
snapshots.append(snapshot(train_proc, "balance", "train", group_col, target_col, base_cols))

# ====== Summary table + overview plot ======
summary_df = pd.DataFrame(snapshots)
display(summary_df)
plot_overview(summary_df)


In [None]:
def plot_sampling_per_stay(before_df, after_df, group_col, title, before_xmax=1100):
    """Plot histograms of rows-per-group before and after sampling.

    Args:
        before_df: Frame before sampling.
        after_df: Frame after sampling.
        group_col: Column identifying the group (stay).
        title: Prefix for both panel titles (e.g. "Train").
        before_xmax: Upper x-axis limit of the "before" panel. Defaults to 1100,
            the value that used to be hard-coded in this definition; pass
            ``None`` to let matplotlib choose the range.
    """
    before_counts = before_df.groupby(group_col).size()
    after_counts = after_df.groupby(group_col).size()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    sns.histplot(before_counts, bins=30, color="#999999", ax=axes[0])
    axes[0].set_title(f"{title}: rows per stay (before)")
    axes[0].set_xlabel("rows per stay")
    if before_xmax is not None:
        axes[0].set_xlim(0, before_xmax)
    sns.histplot(after_counts, bins=30, color="#ff7f0e", ax=axes[1])
    axes[1].set_title(f"{title}: rows per stay (after)")
    axes[1].set_xlabel("rows per stay")
    plt.tight_layout()
    plt.show()
# Re-run the per-stay sampling and redraw the before/after histograms.
# NOTE(review): this rebinds train_proc / test_proc to the sampled but
# un-imputed, un-balanced frames, and appends a second pair of "sample_k" rows
# to `snapshots` — confirm both are intended.
train_before_sample = train_df.copy()
test_before_sample = test_df.copy()
train_proc = sample_k_per_stay(train_df, group_col, time_col, k=10)
test_proc = sample_k_per_stay(test_df, group_col, time_col, k=10)
plot_sampling_per_stay(train_before_sample, train_proc, group_col, "Train")
plot_sampling_per_stay(test_before_sample, test_proc, group_col, "Test")
snapshots.append(snapshot(train_proc, "sample_k", "train", group_col, target_col, base_cols))
snapshots.append(snapshot(test_proc, "sample_k", "test", group_col, target_col, base_cols))

In [None]:
def calc_data_loss(before_df, after_df, group_col, label=""):
    """Print and return how many rows and distinct groups were lost between two frames.

    Loss rates are relative to the "before" counts and are 0 when the "before"
    count is 0.

    Returns:
        Dict with the label plus before/after counts, absolute loss and loss
        rate, for rows and for distinct ``group_col`` values ("stays").
    """
    def _drop(before, after):
        lost = before - after
        rate = lost / before if before else 0
        return lost, rate

    n_rows = (len(before_df), len(after_df))
    n_stays = (before_df[group_col].nunique(), after_df[group_col].nunique())
    row_loss, row_loss_rate = _drop(*n_rows)
    stay_loss, stay_loss_rate = _drop(*n_stays)

    print(f"[{label}] rows: {n_rows[0]} -> {n_rows[1]} (loss {row_loss:,}, {row_loss_rate:.2%})")
    print(f"[{label}] stays: {n_stays[0]} -> {n_stays[1]} (loss {stay_loss:,}, {stay_loss_rate:.2%})")

    report = {"label": label}
    report.update(
        rows_before=n_rows[0],
        rows_after=n_rows[1],
        row_loss=row_loss,
        row_loss_rate=row_loss_rate,
        stays_before=n_stays[0],
        stays_after=n_stays[1],
        stay_loss=stay_loss,
        stay_loss_rate=stay_loss_rate,
    )
    return report

def plot_data_loss(loss_dicts, title="Data Loss After Sampling"):
    """Plot row and stay loss rates side by side, one bar per entry's label.

    ``loss_dicts`` is a list of dicts carrying at least ``label``,
    ``row_loss_rate`` and ``stay_loss_rate`` (as produced by calc_data_loss).
    """
    loss_table = pd.DataFrame(loss_dicts)
    _, panels = plt.subplots(1, 2, figsize=(12, 4))
    # (column to plot, panel title, bar color)
    specs = [
        ("row_loss_rate", "Row Loss Rate", "#d62728"),
        ("stay_loss_rate", "Stay Loss Rate", "#9467bd"),
    ]
    for panel, (metric, heading, bar_color) in zip(panels, specs):
        sns.barplot(data=loss_table, x="label", y=metric, ax=panel, color=bar_color)
        panel.set_title(heading)
        panel.set_ylabel("loss rate")
        panel.set_ylim(0, 1)
        panel.tick_params(axis="x", rotation=15)

    plt.suptitle(title)
    plt.tight_layout()
    plt.show()

# Usage example (before vs. after sampling)
train_loss = calc_data_loss(train_before_sample, train_proc, group_col, label="Train sample_k")
test_loss = calc_data_loss(test_before_sample, test_proc, group_col, label="Test sample_k")
plot_data_loss([train_loss, test_loss], title="Sampling Data Loss")


In [None]:
import matplotlib.pyplot as plt

target_col = "died_in_icu"

# Frames whose class ratios are compared side by side (current train_proc / test_proc).
datasets = {
    "train_proc": train_proc,
    "test_proc": test_proc,
}

fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharey=True)

for ax, (name, df) in zip(axes, datasets.items()):
    # Share of each class value, ordered by class label.
    counts = df[target_col].value_counts().sort_index()
    ratios = counts / counts.sum()

    ax.bar(ratios.index.astype(str), ratios.values, color=["tab:blue", "tab:red"])
    ax.set_xlabel(target_col)
    ax.set_title(f"Class Ratio in {name}")

    # Print the percentage just above each bar.
    for i, v in enumerate(ratios.values):
        ax.text(i, v + 0.01, f"{v:.2%}", ha="center")

axes[0].set_ylabel("Ratio")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def age_group_stats(df, age_col="age", group_col="patientunitstayid", time_col="observationoffset"):
    """Print and plot, per age bracket, the number of distinct stays and of rows.

    Ages are bucketed into left-closed brackets ([0, 18), [18, 30), ...,
    [80, 200)); ages outside [0, 200) or missing fall into no bracket and are
    not counted.

    ``observed=False`` is passed explicitly so that empty brackets still appear
    (with a count of 0) regardless of the pandas default, and so that no
    FutureWarning about the changing default is emitted. The input frame is no
    longer copied; the brackets are computed as a separate Series.

    Args:
        df: Input frame (not modified).
        age_col: Column holding the age.
        group_col: Column identifying a stay.
        time_col: Unused; kept for backward compatibility with existing callers.
    """
    # Age bracket definition (adjust if needed)
    bins = [0, 18, 30, 40, 50, 60, 70, 80, 200]
    labels = ["0-17","18-29","30-39","40-49","50-59","60-69","70-79","80+"]

    age_group = pd.cut(df[age_col], bins=bins, labels=labels, right=False).rename("age_group")
    by_age = df.groupby(age_group, observed=False)

    # Distinct stays per bracket
    stay_counts = by_age[group_col].nunique()

    # Time-series length per bracket: number of rows
    offset_counts = by_age.size()

    print("== Age group -> unique stays ==")
    print(stay_counts)
    print("\n== Age group -> total offsets(rows) ==")
    print(offset_counts)

    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=False)
    stay_counts.plot(kind="bar", ax=axes[0], color="tab:blue", title="Unique stays by age group")
    axes[0].set_ylabel("Count")
    axes[0].set_xlabel("Age group")

    offset_counts.plot(kind="bar", ax=axes[1], color="tab:orange", title="Offsets(rows) by age group")
    axes[1].set_ylabel("Count")
    axes[1].set_xlabel("Age group")

    plt.tight_layout()
    plt.show()

# Usage example (train_df is whatever the earlier cells left bound to that name)
age_group_stats(train_df, age_col="feature1", group_col=group_col, time_col=time_col)


In [None]:
# Sanity check: how many features are used, and which ones.
print(len(feature_cols))
print(feature_cols)

## SMOTE

In [None]:
# Handle class imbalance with SMOTE (train only)
# Requires: pip install imbalanced-learn (or uv add imbalanced-learn)
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data only
# NOTE(review): the resampling lines below are commented out, so SMOTE is
# currently NOT applied and both prints show the same y_train distribution.
# smote = SMOTE(sampling_strategy=0.5 / 0.5, random_state=42, k_neighbors=5)
# X_train, y_train = smote.fit_resample(X_train, y_train)
# X_test, y_test = smote.fit_resample(X_test, y_test) 
print("before:", y_train.value_counts(normalize=True))
print("after :", pd.Series(y_train).value_counts(normalize=True))

# Original intent: use X_train_sm, y_train_sm for model training afterwards.
# NOTE(review): those names are not defined in the visible cells; the training
# cell fits on X_train / y_train.


## 모델 학습

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd

models = {}
# Fixed class weight: 92 negatives per 8 positives.
# NOTE(review): the original comment said "assume 98:2", which does not match
# the value below; 92/8 corresponds to the ratio_pos=0.08 passed to balance_ratio.
scale_pos_weight = 92/8

models["XGBoost"] = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)


def aggregate_patient_scores(df: pd.DataFrame, scores: np.ndarray):
    """Collapse row-level labels and scores to one value per group.

    Uses the module-level ``group_col`` and ``target_col``.

    Returns:
        ``(y_patient, s_patient)``: the per-group maximum of the label and the
        per-group mean of ``scores``, both indexed by ``group_col``.
    """
    per_row = df[[group_col, target_col]].copy()
    per_row["score"] = scores
    by_patient = per_row.groupby(group_col)
    label_per_patient = by_patient[target_col].max()
    score_per_patient = by_patient["score"].mean()
    return label_per_patient, score_per_patient

# Model name -> row-level positive-class score on X_test.
pred_scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    if hasattr(model, "predict_proba"):
        # Column 1 of predict_proba is taken as the positive-class probability.
        scores = model.predict_proba(X_test)[:, 1]
    else:
        # No probabilities available: squash the decision function with a sigmoid.
        raw = model.decision_function(X_test)
        scores = 1 / (1 + np.exp(-raw))

    pred_scores[name] = scores

print("models used:", list(pred_scores.keys()))
print("test n:", len(y_test))


In [None]:
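# Visualization: ROC + calibration + decision curve (row-level, evaluated on y_test). NOTE: the SMOTE resampling earlier in this file is commented out, so the "SMOTE test" wording in the plot titles is a leftover label.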
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.calibration import calibration_curve
import numpy as np

# One figure, three panels: ROC, calibration, decision curve.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1) ROC
ax = axes[0]
for name, scores in pred_scores.items():
    fpr, tpr, _ = roc_curve(y_test, scores)
    auc = roc_auc_score(y_test, scores)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
# Diagonal reference line.
ax.plot([0, 1], [0, 1], "k--", alpha=0.5)
ax.set_xlabel("1 - Specificity")
ax.set_ylabel("Sensitivity")
ax.set_title("ROC Curves (SMOTE test)")
ax.legend(loc="lower right")

# 2) Calibration
ax = axes[1]
for name, scores in pred_scores.items():
    # 10 equal-width probability bins.
    prob_true, prob_pred = calibration_curve(y_test, scores, n_bins=10, strategy="uniform")
    ax.plot(prob_pred, prob_true, marker="o", label=name)
ax.plot([0, 1], [0, 1], "k--", alpha=0.5, label="Ideal")
ax.set_xlabel("Mean predicted probability")
ax.set_ylabel("Fraction of positives")
ax.set_title("Calibration Curves (SMOTE test)")
ax.legend(loc="lower right")

# 3) Decision Curve
ax = axes[2]
# Threshold probabilities 0.01 .. 0.99; read as a global by decision_curve below.
thresholds = np.linspace(0.01, 0.99, 99)

def decision_curve(y_true, scores):
    """Return the net benefit of ``scores`` at every value of the global ``thresholds``.

    For a threshold ``t``, rows with ``scores >= t`` are flagged and the net
    benefit is ``TP/n - FP/n * t / (1 - t)``, with ``n = len(y_true)``.
    """
    total = len(y_true)
    is_positive = y_true == 1
    is_negative = y_true == 0

    def _net_benefit(t):
        flagged = scores >= t
        tp = ((flagged == 1) & is_positive).sum()
        fp = ((flagged == 1) & is_negative).sum()
        return (tp / total) - (fp / total) * (t / (1 - t))

    return np.array([_net_benefit(t) for t in thresholds])

# Net-benefit curve for each model.
for name, scores in pred_scores.items():
    nb = decision_curve(np.array(y_test), scores)
    ax.plot(thresholds, nb, label=name)

# Reference strategies: flag everyone ("Treat all") and flag no one ("Treat none").
prevalence = np.mean(y_test)
nb_all = prevalence - (1 - prevalence) * (thresholds / (1 - thresholds))
ax.plot(thresholds, nb_all, "k--", alpha=0.4, label="Treat all")
ax.plot(thresholds, np.zeros_like(thresholds), "k:", alpha=0.6, label="Treat none")

ax.set_xlabel("Threshold probability")
ax.set_ylabel("Net benefit")
ax.set_title("Decision Curve Analysis (SMOTE test)")
ax.legend(loc="upper right")

plt.tight_layout()
plt.show()


In [None]:
# Confusion-matrix subplots (row-level: y_test vs. scores thresholded at 0.5; no per-patient aggregation is applied in this cell)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import math
import numpy as np

# Grid layout: up to 3 confusion matrices per row.
n_models = len(pred_scores)
n_cols = 3
n_rows = math.ceil(n_models / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
# Flatten so the axes can be zipped with the models whatever the grid shape.
axes = np.array(axes).reshape(-1)

for ax, (name, scores) in zip(axes, pred_scores.items()):
    # Hard predictions at a fixed 0.5 cut-off.
    y_pred = (scores >= 0.5).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(ax=ax, cmap="Blues", colorbar=False)
    ax.set_title(f"{name}")

# Hide the grid cells that have no model.
for ax in axes[n_models:]:
    ax.axis("off")

plt.tight_layout()
plt.show()


In [None]:
from __future__ import annotations

import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

def evaluate_row_level(
    pred_scores: dict[str, np.ndarray | pd.Series],
    y_true: np.ndarray | pd.Series,
    threshold: float = 0.5,
) -> pd.DataFrame:
    """
    Row-level evaluation of every model in ``pred_scores`` against ``y_true``.

    Scores at or above ``threshold`` count as positive predictions. AUC and
    PR-AUC are NaN when ``y_true`` contains a single class. The result has one
    row per model (indexed by model name), sorted by PR-AUC then AUC, descending.
    """
    labels = np.asarray(y_true).astype(int)
    n_rows = len(labels)
    positive_rate = float(labels.mean())
    # Ranking metrics are undefined when only one class is present.
    single_class = np.unique(labels).size < 2

    rows = []
    for model_name, raw_scores in pred_scores.items():
        probs = np.asarray(raw_scores).astype(float)
        hard = (probs >= threshold).astype(int)

        if single_class:
            auc, ap = np.nan, np.nan
        else:
            auc = roc_auc_score(labels, probs)
            ap = average_precision_score(labels, probs)

        rows.append({
            "model": model_name,
            "N_rows": n_rows,
            "pos_rate": positive_rate,
            "threshold": threshold,
            "ACC": accuracy_score(labels, hard),
            "PREC": precision_score(labels, hard, zero_division=0),
            "REC": recall_score(labels, hard, zero_division=0),
            "F1": f1_score(labels, hard, zero_division=0),
            "AUC": auc,
            "PR-AUC": ap,
        })

    table = pd.DataFrame(rows).set_index("model")
    return table.sort_values(["PR-AUC", "AUC"], ascending=False)


# Run the row-level evaluation on y_test.
# NOTE(review): the original comment said "SMOTE test", but the SMOTE
# resampling is commented out in the visible cells.
metrics_df = evaluate_row_level(
    pred_scores=pred_scores,
    y_true=y_test,
    threshold=0.5
)

# Print every float with 4 decimals.
print(metrics_df.to_string(float_format=lambda x: f"{x:.4f}"))


In [None]:
# SHAP 분석 + 중요 피처 Top-N (row-level, 현재 models 기준)
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Explain at most 1000 randomly chosen test rows.
X_shap = X_test.sample(n=min(1000, len(X_test)), random_state=42)
# "Top N" here is simply every feature.
top_n = len(feature_cols)

for name, model in models.items():
    try:
        if name in ["RF", "XGBoost", "LightGBM", "CatBoost"]:
            # Models with these names get the tree explainer.
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_shap)
        else:
            # Any other model: kernel explainer with a 100-row background sample.
            bg = X_shap.sample(100, random_state=42)
            explainer = shap.KernelExplainer(model.predict_proba, bg)
            shap_values = explainer.shap_values(X_shap, nsamples=100)

        # When a list with more than one entry comes back, use entry 1
        # (presumably the positive class — TODO confirm for each explainer).
        if isinstance(shap_values, list) and len(shap_values) > 1:
            sv = shap_values[1]
        else:
            sv = shap_values

        # Importance = mean absolute SHAP value per feature.
        mean_abs = np.abs(sv).mean(axis=0)
        imp = pd.Series(mean_abs, index=X_shap.columns).sort_values(ascending=False)

        print(f"\n{name} Top {top_n} features:")
        print(imp.head(top_n))

        plt.figure(figsize=(10, 10))
        imp.head(top_n).sort_values().plot(kind="barh")
        plt.title(f"{name} SHAP Feature Importance (Top {top_n})")
        plt.tight_layout()
        plt.show()

        shap.summary_plot(sv, X_shap, show=True, plot_type="bar")
        shap.summary_plot(sv, X_shap, show=True)

    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"SHAP failed for {name}: {e}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

cols = feature_cols
# Pearson correlation between the selected features (computed on X_train)
corr = X_train[cols].corr(method="pearson")
display(corr)

# Heatmap of the correlation matrix
plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation (Selected Features)")
plt.tight_layout()
plt.show()


In [None]:
# Missing rate of each feature in feature_cols, split by target value (died vs. survived)
target_col = "died_in_icu"

# Computed on train_df (the un-imputed frame), grouped by the label value.
missing_rate_by_t = (
    train_df[feature_cols]
    .isna()
    .groupby(train_df[target_col])
    .mean()
)

# Transpose for readability (features as rows, target values as columns)
missing_rate_by_t_T = missing_rate_by_t.T
missing_rate_by_t_T.columns = [f"{target_col}={c}" for c in missing_rate_by_t_T.columns]

display(missing_rate_by_t_T)
