<a href="https://colab.research.google.com/github/peculab/AI4JUBO/blob/main/JuboDeath_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install shap plotly xgboost --quiet

In [None]:
# NOTE(review): `ace_tools` is not imported by any cell visible in this notebook, and
# the distribution pip resolves for that name is a ~1 kB, version-0.0 wheel (see the
# recorded install log below). Installing an unused, unvetted package is an avoidable
# supply-chain risk, so the install is disabled. Re-enable it only if a cell really
# needs it, and pin the version when you do.
# %pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from IPython.display import display
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, mean_absolute_error, r2_score
)

In [None]:
# 載入與清理資料
chunks = pd.read_csv("trainingData.csv", chunksize=5000, low_memory=False, on_bad_lines='skip')
df = pd.concat(chunks, ignore_index=True)

In [None]:
# 前 13 個重要特徵（由圖中得出）
top_13_features = [
    '預估年齡', 'ADL_總分_max', '六個月內住院次數', '意識清醒_max', '使用呼吸輔具',
    '性別_is_male', 'CMS_value', 'SOF_總分_max', '體重變化_max', 'GDS_總分_max',
    '跌倒次數_std', '跌倒次數_max_1', '藥物數_max'
]

In [None]:
# Feature matrix: keep only the 13 selected columns.
df_sub = df.loc[:, top_13_features]
# Target vector: the '死亡標記' column.
y = df.loc[:, '死亡標記']

In [None]:
def impute_features(df):
    """Return a copy of ``df`` in which every missing value has been filled.

    Each rule is applied only when the columns it needs are present, so the
    function can also be called on a subset of the feature columns.

    1. Count / flag columns (``六個月內住院次數``, ``使用呼吸輔具``, ``跌倒次數_std``,
       ``跌倒次數_max_1``): NaN becomes 0.
    2. ``意識清醒_max``: NaN becomes the column mode (0 if the column is all-NaN).
    3. ``ADL_總分_max``, ``GDS_總分_max``, ``CMS_value``: NaN becomes the median of
       the row's ``性別_is_male`` group. Skipped when ``性別_is_male`` is absent.
    4. ``SOF_總分_max``: NaN becomes the median of the row's ``CMS_value`` tercile;
       if the terciles cannot be built, the overall column median is used.
    5. ``體重變化_max``, ``藥物數_max``: forward-fill then back-fill in row order.
    6. Whatever is still NaN becomes 0.

    Values that are already present are never modified. This includes rows whose
    grouping key (``性別_is_male`` or ``CMS_value``) is itself missing; such rows
    simply receive no group median and fall through to rule 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index and no NaN.
    """
    df = df.copy()

    # 1. Count-like / flag columns: a missing value is treated as 0.
    for col in ['六個月內住院次數', '使用呼吸輔具', '跌倒次數_std', '跌倒次數_max_1']:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # 2. Consciousness state (categorical): fill with the most frequent value.
    if '意識清醒_max' in df.columns:
        mode_val = df['意識清醒_max'].mode()
        fill_value = mode_val.iloc[0] if not mode_val.empty else 0
        df['意識清醒_max'] = df['意識清醒_max'].fillna(fill_value)

    # 3. Gender-grouped median.
    #    The group median is broadcast to every row and used only to fill gaps.
    #    Assigning the result of transform(lambda x: x.fillna(...)) directly would
    #    overwrite observed values with NaN on rows whose gender is missing,
    #    because groupby drops NaN keys.
    if '性別_is_male' in df.columns:
        for col in ['ADL_總分_max', 'GDS_總分_max', 'CMS_value']:
            if col in df.columns:
                group_median = df.groupby('性別_is_male')[col].transform('median')
                df[col] = df[col].fillna(group_median)

    # 4. Fill SOF by CMS tercile.
    #    The tercile labels live in a local Series rather than a temporary column,
    #    so a failure can never leave a helper column behind in the output.
    if 'SOF_總分_max' in df.columns and 'CMS_value' in df.columns:
        try:
            cms_level = pd.qcut(df['CMS_value'], 3, duplicates='drop')
            level_median = (
                df.groupby(cms_level, observed=True)['SOF_總分_max'].transform('median')
            )
            df['SOF_總分_max'] = df['SOF_總分_max'].fillna(level_median)
        except Exception:
            # Best-effort fallback when the terciles cannot be computed.
            df['SOF_總分_max'] = df['SOF_總分_max'].fillna(df['SOF_總分_max'].median())

    # 5. Weight change / medication count: neighbour fill in row order.
    #    NOTE(review): nothing here sorts the rows by time or groups them by
    #    resident, so this copies the value of the adjacent row — confirm intended.
    #    The mean fallback only applies if NaN survives both fills.
    for col in ['體重變化_max', '藥物數_max']:
        if col in df.columns:
            df[col] = df[col].ffill().bfill().fillna(df[col].mean())

    # 6. Safety net: guarantee that no NaN is left anywhere.
    df = df.fillna(0)

    return df

In [None]:
# Impute the full 13-feature matrix, then hold out 20% of the rows for testing.
# NOTE(review): imputation runs on the whole frame before the split, so the fill
# statistics are computed from rows that end up in the test set — confirm this is
# acceptable for the reported metrics.
X_xgb = impute_features(df_sub)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_xgb,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: binary classification model (death vs. no death).
# Every hyperparameter lists a single value, so the search evaluates exactly one
# candidate with 3-fold cross-validation.
clf_grid = dict(
    n_estimators=[500],       # number of trees (more is steadier but slower)
    max_depth=[7],            # maximum depth of each tree
    learning_rate=[0.05],     # step size (too high overfits, too low learns slowly)
    subsample=[0.8],          # fraction of rows sampled per tree (lower curbs overfitting)
    colsample_bytree=[0.8],   # fraction of features sampled per tree
)
clf = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=clf_grid,
    cv=3,
    scoring='accuracy',
)
clf.fit(X_train_cls, y_train_cls)

# Keep the selected estimator and its hard predictions on the held-out rows.
best_clf = clf.best_estimator_
y_pred_cls = best_clf.predict(X_test_cls)





In [None]:
# Show the hyperparameter combination selected by the grid search.
print("最佳參數組合：", clf.best_params_)

最佳參數組合： {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.8}


In [None]:
import plotly.express as px
import pandas as pd

# Pull per-feature importance out of the fitted booster.
# importance_type='weight' is the number of times a feature is used in a split.
booster = best_clf.get_booster()
importance_dict = booster.get_score(importance_type='weight')

# One row per feature, highest importance first.
importance_df = pd.DataFrame(
    list(importance_dict.items()),
    columns=["Feature", "Importance"],
).sort_values(by="Importance", ascending=False)

# Keep the ten highest-scoring features.
top10 = importance_df.head(10)

# Horizontal bar chart; rows are re-sorted ascending so the largest bar is on top.
bar_data = top10.sort_values(by="Importance")
fig = px.bar(
    bar_data,
    x="Importance",
    y="Feature",
    orientation='h',
    title="🔍 Top 10 Features by XGBoost Importance",
    labels={"Importance": "重要性分數", "Feature": "特徵名稱"},
    width=700,
    height=500,
)
# dtick=1 forces a tick label for every bar.
fig.update_layout(yaxis={"dtick": 1})
fig.show()

In [None]:
# ✅ Step 3: 分類視覺化與報告
conf_matrix = confusion_matrix(y_test_cls, y_pred_cls)
labels = ["非死亡", "死亡"]
fig_confusion = go.Figure(data=go.Heatmap(
    z=conf_matrix, x=labels, y=labels, colorscale='Blues', text=conf_matrix,
    texttemplate="%{text}", hovertemplate='預測: %{x}<br>實際: %{y}<br>數量: %{z}<extra></extra>'
))
fig_confusion.update_layout(title="📊 混淆矩陣", xaxis_title="預測", yaxis_title="實際")

In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# Predicted probability of the positive class (death = 1) on the held-out rows.
y_pred_proba = best_clf.predict_proba(X_test_cls)[:, 1]

# ROC operating points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test_cls, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Model curve first, then the dashed diagonal chance line.
fig_roc = go.Figure(
    data=[
        go.Scatter(
            x=fpr,
            y=tpr,
            mode='lines',
            name=f"ROC Curve (AUC = {roc_auc:.2f})",
        ),
        go.Scatter(
            x=[0, 1],
            y=[0, 1],
            mode='lines',
            line=dict(dash='dash'),
            name='Random Baseline',
        ),
    ]
)

fig_roc.update_layout(
    title="📈 ROC 曲線（XGBoost 二分類）",
    xaxis_title="False Positive Rate (假陽性率)",
    yaxis_title="True Positive Rate (真正率)",
    width=700,
    height=500,
)

fig_roc.show()

In [None]:
# Alrawi (2013)
X_a = impute_features(df_sub[['意識清醒_max', '使用呼吸輔具', 'CMS_value', '性別_is_male']])
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_a, y, test_size=0.2, random_state=42)
model_a = LogisticRegression(max_iter=1000)
model_a.fit(X_train_a, y_train_a)
y_pred_a = model_a.predict(X_test_a)
y_prob_a = model_a.predict_proba(X_test_a)[:, 1]

# Chandra (2022)
X_c = impute_features(df_sub[['預估年齡', '六個月內住院次數', 'ADL_總分_max', 'GDS_總分_max']])
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y, test_size=0.2, random_state=42)
model_c = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05, random_state=42)
model_c.fit(X_train_c, y_train_c)
y_pred_c = model_c.predict(X_test_c)
y_prob_c = model_c.predict_proba(X_test_c)[:, 1]

# García-Gollarte (2020)
X_g = impute_features(df_sub[['ADL_總分_max', '體重變化_max', 'GDS_總分_max']])
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_g, y, test_size=0.2, random_state=42)
model_g = RandomForestClassifier(n_estimators=100, random_state=42)
model_g.fit(X_train_g, y_train_g)
y_pred_g = model_g.predict(X_test_g)
y_prob_g = model_g.predict_proba(X_test_g)[:, 1]

# Levy (2015)
X_l = impute_features(df_sub[['CMS_value', '使用呼吸輔具', '藥物數_max']])
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X_l, y, test_size=0.2, random_state=42)
model_l = LogisticRegression(max_iter=1000)
model_l.fit(X_train_l, y_train_l)
y_pred_l = model_l.predict(X_test_l)
y_prob_l = model_l.predict_proba(X_test_l)[:, 1]

In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# Display name -> (true labels, predicted positive-class probabilities) per model.
model_info = {
    "NTNU PecuLab (2025)": (y_test_cls, y_pred_proba),
    "Alrawi (2013)": (y_test_a, y_prob_a),
    "Chandra (2022)": (y_test_c, y_prob_c),
    "García-Gollarte (2020)": (y_test_g, y_prob_g),
    "Levy (2015)": (y_test_l, y_prob_l),
}

# Empty figure that collects one ROC trace per model.
fig_roc_all = go.Figure()

# One ROC curve per model, with its AUC in the legend entry.
for model_name, (y_true, y_score) in model_info.items():
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    legend_label = f"{model_name} (AUC = {roc_auc:.2f})"
    fig_roc_all.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=legend_label))

# Dashed diagonal: the chance-level reference line.
fig_roc_all.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        line={"dash": 'dash'},
        name='Random Baseline',
    )
)

# Titles and canvas size.
fig_roc_all.update_layout(
    title="📊 多模型 ROC 曲線比較",
    xaxis_title="False Positive Rate (假陽性率)",
    yaxis_title="True Positive Rate (真正率)",
    width=800,
    height=600,
)

fig_roc_all.show()