<a href="https://colab.research.google.com/github/peculab/AI4JUBO/blob/main/JuboDeath_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install shap plotly xgboost --quiet

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, mean_absolute_error, r2_score
)

In [None]:
# Step 1: load and clean the data.
# The CSV is parsed in 5000-row chunks that are then stitched back into a single
# frame; malformed lines are skipped (on_bad_lines='skip') instead of raising.
chunks = pd.read_csv(
    "trainingData.csv",
    chunksize=5000,
    low_memory=False,
    on_bad_lines='skip',
)
df = pd.concat(chunks, ignore_index=True)

In [None]:
# Summary statistics of the CMS_value column (describe() excludes missing values
# from the count and from every statistic).
df['CMS_value'].describe()

Unnamed: 0,CMS_value
count,3824.0
mean,3.29001
std,1.940481
min,1.0
25%,2.0
50%,2.0
75%,4.0
max,8.0


In [None]:
# Number of rows in the loaded frame.
len(df)

26330

In [None]:
# Remove identifier-like columns: every column whose name contains '_NUM_' or
# 'dbname_', plus 'H01_NUM' and 'dbname' themselves. Names that are not present
# are ignored rather than raising.
drop_cols = [name for name in df.columns if ('_NUM_' in name) or ('dbname_' in name)]
df = df.drop(columns=[*drop_cols, 'H01_NUM', 'dbname'], errors='ignore')
# Median imputation is left disabled, so missing values stay as NaN:
#df.fillna(df.median(numeric_only=True), inplace=True)

In [None]:
# Preview the first rows after the identifier columns were dropped.
df.head()

Unnamed: 0,死亡標記,觀察天數,性別_is_male,預估年齡,DNR_flag,CDR_value,CMS_value,ADL_總分_max,GDS_總分_max,SOF_總分_max,...,多重用藥,小腿圍_max,體重變化_max,意識清醒_max,跌倒次數_max_1,跌倒次數_std,使用呼吸輔具,活動假牙,使用管路,六個月內住院次數
0,1,178,1,90.0,0,,,,,,...,,,,,,,,,,
1,1,28,1,73.0,0,,,,,,...,,,,0.0,,,0.0,0.0,1.0,1.0
2,1,50,0,91.0,0,,,5.0,0.0,,...,,,,0.0,,,0.0,0.0,1.0,
3,1,74,0,71.0,0,,,10.0,,3.0,...,0.0,,0.0,0.0,,,0.0,0.0,1.0,
4,1,162,1,87.0,0,,,20.0,,,...,,,,0.0,,,0.0,0.0,1.0,1.0


In [None]:
# List the columns that remain available as label / feature candidates.
df.columns

Index(['死亡標記', '觀察天數', '性別_is_male', '預估年齡', 'DNR_flag', 'CDR_value',
       'CMS_value', 'ADL_總分_max', 'GDS_總分_max', 'SOF_總分_max', '跌倒次數_max',
       'MMSE_時間定向_max', 'MMSE_位置定向_max', '藥物數_max', '使用精神藥', '多重用藥', '小腿圍_max',
       '體重變化_max', '意識清醒_max', '跌倒次數_max_1', '跌倒次數_std', '使用呼吸輔具', '活動假牙',
       '使用管路', '六個月內住院次數'],
      dtype='object')

In [None]:
# The 13 most important features (read off the feature-importance chart).
top_13_features = [
    '預估年齡',
    'ADL_總分_max',
    '六個月內住院次數',
    '意識清醒_max',
    '使用呼吸輔具',
    '性別_is_male',
    'CMS_value',
    'SOF_總分_max',
    '體重變化_max',
    'GDS_總分_max',
    '跌倒次數_std',
    '跌倒次數_max_1',
    '藥物數_max',
]

In [None]:
# Build the classifier inputs: the selected features as float32 and the
# death flag as the label.
df_filtered = df[[*top_13_features, '死亡標記']]
y_cls = df_filtered['死亡標記']
X_cls = df_filtered[top_13_features].astype('float32')

In [None]:
# Step 2: stage-one classifier (death vs. no death), tuned by grid search.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)

# 3 values for each of 5 hyper-parameters = 243 candidates, each fitted cv=3 times.
clf_grid = dict(
    n_estimators=[100, 300, 500],        # number of trees (more is steadier but slower)
    max_depth=[3, 5, 7],                 # maximum depth of each tree
    learning_rate=[0.01, 0.05, 0.1],     # step size (too high overfits, too low learns slowly)
    subsample=[0.6, 0.8, 1.0],           # fraction of rows sampled per tree
    colsample_bytree=[0.6, 0.8, 1.0],    # fraction of features sampled per tree
)

clf = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=clf_grid,
    cv=3,
    scoring='accuracy',
)
clf.fit(X_train_cls, y_train_cls)

# Keep the refitted best model and its hard predictions on the held-out split.
best_clf = clf.best_estimator_
y_pred_cls = best_clf.predict(X_test_cls)

In [None]:
# Step 2.1: rank and plot the classifier's feature importances.
import plotly.express as px

clf_feat_imp = pd.Series(best_clf.feature_importances_, index=X_train_cls.columns)
# Two-column frame (feature name, score), largest score first.
clf_feat_df = (
    clf_feat_imp.abs()
    .sort_values(ascending=False)
    .rename_axis('特徵')
    .reset_index(name='重要性分數')
)

fig_clf_feat = px.bar(
    clf_feat_df.head(20),
    x='重要性分數',
    y='特徵',
    orientation='h',
    title='📊 分類模型特徵重要性（XGBoost）',
    labels={'重要性分數': '特徵重要性'},
)
# Reverse the y axis so the most important feature is drawn at the top.
fig_clf_feat.update_layout(yaxis=dict(autorange='reversed'))
fig_clf_feat.show()

In [None]:
# Step 3: classification visualisation and report.
# confusion_matrix puts true labels on rows and predictions on columns, which is
# why x is titled "predicted" and y "actual" below.
conf_matrix = confusion_matrix(y_test_cls, y_pred_cls)
labels = ["非死亡", "死亡"]
heatmap = go.Heatmap(
    z=conf_matrix,
    x=labels,
    y=labels,
    colorscale='Blues',
    text=conf_matrix,
    texttemplate="%{text}",
    hovertemplate='預測: %{x}<br>實際: %{y}<br>數量: %{z}<extra></extra>',
)
fig_confusion = go.Figure(data=heatmap)
# update_layout returns the figure, so as the last expression it renders the plot.
fig_confusion.update_layout(title="📊 混淆矩陣", xaxis_title="預測", yaxis_title="實際")

In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# Probability the model assigns to the positive class (death = 1).
y_pred_proba = best_clf.predict_proba(X_test_cls)[:, 1]

# False/true positive rates over all thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test_cls, y_pred_proba)
roc_auc = auc(fpr, tpr)

# ROC curve plus the diagonal of a random classifier for reference.
roc_trace = go.Scatter(x=fpr, y=tpr, mode='lines', name=f"ROC Curve (AUC = {roc_auc:.2f})")
baseline_trace = go.Scatter(
    x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random Baseline'
)
fig_roc = go.Figure(data=[roc_trace, baseline_trace])

fig_roc.update_layout(
    title="📈 ROC 曲線（XGBoost 二分類）",
    xaxis_title="False Positive Rate (假陽性率)",
    yaxis_title="True Positive Rate (真正率)",
    width=700,
    height=500,
)

fig_roc.show()

In [None]:
# Metrics computed from the hard 0/1 predictions.
label_based_scorers = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
]
classification_metrics = {
    name: scorer(y_test_cls, y_pred_cls) for name, scorer in label_based_scorers
}
# ROC AUC is threshold-free, so it is scored on the positive-class probability.
classification_metrics["ROC AUC"] = roc_auc_score(
    y_test_cls, best_clf.predict_proba(X_test_cls)[:, 1]
)
classification_metrics

{'Accuracy': 0.7951006456513483,
 'Precision': 0.7397344859524545,
 'Recall': 0.9103343465045592,
 'F1': 0.8162152955203543,
 'ROC AUC': np.float64(0.8701330566152696)}

In [None]:
# Step 4: regression model, fitted on death samples only.
df_dead = df.loc[df['死亡標記'] == 1].copy()
y_reg = df_dead['觀察天數']
X_reg = df_dead.drop(columns=['觀察天數', '死亡標記'], errors='ignore').astype('float32')
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Single-candidate grid: the search only cross-validates this one configuration.
reg_grid = dict(
    n_estimators=[100],
    max_depth=[3],
    learning_rate=[0.05],
    subsample=[0.8],
)
reg = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=reg_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
)
# The target is fitted on a log1p scale; predictions are mapped back with expm1.
reg.fit(X_train_reg, np.log1p(y_train_reg))
best_reg = reg.best_estimator_
y_pred_log = best_reg.predict(X_test_reg)
y_pred = np.expm1(y_pred_log)

In [None]:
# Step 5: regression visualisation and report.
# Scatter of actual vs. predicted days; points on the dashed red y = x line are
# perfect predictions.
fig_regression = px.scatter(
    x=y_test_reg, y=y_pred,
    labels={'x': '實際死亡天數', 'y': '預測死亡天數'},
    title='📉 死亡天數預測：實際 vs 預測'
)
fig_regression.add_shape(
    type='line', x0=0, y0=0, x1=180, y1=180,
    line=dict(color='red', dash='dash')
)
# Render explicitly: the cell's last expression is the metrics dict, so without
# this call the figure would be built but never displayed.
fig_regression.show()

# Error metrics on the original (days) scale.
regression_metrics = {
    "MAE": mean_absolute_error(y_test_reg, y_pred),
    "R²": r2_score(y_test_reg, y_pred)
}
regression_metrics

{'MAE': 36.28933334350586, 'R²': 0.06138831377029419}

In [None]:
# Step 6: distribution of observed days among death samples
# (50-bin histogram with a marginal box plot).
fig_death_dist = px.histogram(
    df_dead,
    x='觀察天數',
    nbins=50,
    marginal='box',
    title='🪦 死亡樣本觀察天數分布',
    labels={'觀察天數': '從入家到死亡的天數'},
)
fig_death_dist

In [None]:
# Step 7: feature ranking for the regression model.
# NOTE(review): the values plotted here are the model's feature_importances_,
# not SHAP values; the "SHAP" wording in the labels below is only nominal.
reg_feat_imp = pd.Series(best_reg.feature_importances_, index=X_reg.columns)
reg_feat_df = (
    reg_feat_imp.abs()
    .sort_values(ascending=False)
    .rename_axis('特徵')
    .reset_index(name='平均 SHAP 近似值')
)
fig_shap_bar = px.bar(
    reg_feat_df.head(20),
    x='平均 SHAP 近似值',
    y='特徵',
    orientation='h',
    title='📊 特徵重要性（模擬 SHAP）',
)
# Largest bar on top; update_layout returns the figure, so the cell renders it.
fig_shap_bar.update_layout(yaxis=dict(autorange="reversed"))

In [None]:
# Step 3.1: LightGBM quantile regression (predicts the median days to death).
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import plotly.express as px
import pandas as pd
import numpy as np

# Features and target come from df_dead (death samples only).
y_qr = df_dead['觀察天數']
X_qr = df_dead.drop(columns=['觀察天數', '死亡標記'], errors='ignore').astype('float32')
X_train_qr, X_test_qr, y_train_qr, y_test_qr = train_test_split(
    X_qr, y_qr, test_size=0.2, random_state=42
)

# Quantile objective with alpha = 0.5, i.e. the conditional median.
quantile = 0.5
lgb_params = dict(
    objective='quantile',
    alpha=quantile,
    metric='mae',
    learning_rate=0.1,
    max_depth=4,
    num_leaves=31,
    verbosity=-1,
)
lgb_train = lgb.Dataset(X_train_qr, label=y_train_qr)
model_qr = lgb.train(lgb_params, lgb_train, num_boost_round=100)

# Predict on the held-out split and score on the original (days) scale.
y_pred_qr = model_qr.predict(X_test_qr)
mae_qr = mean_absolute_error(y_test_qr, y_pred_qr)
r2_qr = r2_score(y_test_qr, y_pred_qr)

# Actual vs. predicted scatter with a dashed y = x reference line.
df_qr_result = pd.DataFrame({'實際死亡天數': y_test_qr, '預測死亡天數': y_pred_qr})
fig_qr = px.scatter(
    df_qr_result,
    x='實際死亡天數',
    y='預測死亡天數',
    title=f'🔮 LightGBM Quantile Regression (q={quantile})',
    labels={'實際死亡天數': '實際天數', '預測死亡天數': '預測天數'},
)
fig_qr.add_shape(
    type='line', x0=0, y0=0, x1=180, y1=180,
    line=dict(color='red', dash='dash'),
)

# Print the metrics, then render the chart.
print(f"📌 Quantile Regression MAE: {mae_qr:.2f} 天")
print(f"📌 Quantile Regression R²: {r2_qr:.2f}")
fig_qr.show()

📌 Quantile Regression MAE: 35.68 天
📌 Quantile Regression R²: 0.13


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Step 1: prepare the data (death samples only); both models share this split.
y = df_dead['觀察天數']
X = df_dead.drop(columns=['觀察天數', '死亡標記'], errors='ignore').astype('float32')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: XGBoost fitted on log1p(days); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)
xgb_model = xgb.XGBRegressor(
    n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42
)
xgb_model.fit(X_train, y_train_log)
y_pred_xgb = np.expm1(xgb_model.predict(X_test))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Step 3: LightGBM median (alpha = 0.5) quantile regression on the raw day counts.
lgb_params = dict(
    objective='quantile',
    alpha=0.5,
    metric='mae',
    learning_rate=0.1,
    max_depth=4,
    num_leaves=31,
    verbosity=-1,
)
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)
y_pred_lgb = lgb_model.predict(X_test)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Step 4: overlay both models' predictions against the actual values.
df_compare = pd.DataFrame({
    '實際天數': y_test,
    'XGBoost 預測': y_pred_xgb,
    'LightGBM 預測': y_pred_lgb,
})

fig = px.scatter(
    df_compare,
    x='實際天數',
    y=['XGBoost 預測', 'LightGBM 預測'],
    labels={'value': '預測天數', 'variable': '模型'},
    title='📊 XGBoost vs LightGBM 預測結果比較',
)
# Dashed y = x reference line.
fig.add_shape(
    type='line', x0=0, y0=0, x1=180, y1=180,
    line=dict(color='red', dash='dash'), name='y=x 參考線',
)

# Step 5: summary of both models.
print("📌 XGBoost (log1p)")
print(f"   MAE: {mae_xgb:.2f} 天 | R²: {r2_xgb:.2f}")
print("📌 LightGBM (Quantile)")
print(f"   MAE: {mae_lgb:.2f} 天 | R²: {r2_lgb:.2f}")
fig.show()

📌 XGBoost (log1p)
   MAE: 36.25 天 | R²: 0.07
📌 LightGBM (Quantile)
   MAE: 35.68 天 | R²: 0.13


In [None]:
# ✅ trainingData_pipeline.py
# 完整流程：資料載入 → 特徵工程 → 區段分類 + 分段回歸（log）+ Quantile 回歸 + SHAP + 提早死亡預測分類 + 區間圖 + 特徵強化

import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import shap
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score, classification_report

# -----------------------------
# Step 1: load data + feature engineering
# -----------------------------
print("📥 載入資料中...")
df = pd.read_csv("trainingData.csv", low_memory=False)

# Per-H01_NUM change and summary features for the three score columns
# (only possible when the H01_NUM grouping column is present).
# NOTE(review): diff() follows the current row order within each group; this
# presumably assumes rows are already in chronological order -- confirm.
if 'H01_NUM' not in df.columns:
    print("⚠️ 無法做變化特徵：H01_NUM 不存在")
else:
    for col in ['ADL_總分_max', 'GDS_總分_max', 'SOF_總分_max']:
        if col not in df.columns:
            continue
        by_id = df.groupby('H01_NUM')[col]
        df[f'{col}_diff'] = by_id.diff().fillna(0)
        df[f'{col}_mean'] = by_id.transform('mean')
        df[f'{col}_std'] = by_id.transform('std').fillna(0)

# -----------------------------
# Step 2: keep death samples + build class labels
# -----------------------------
df_death = df.loc[df['死亡標記'] == 1].copy()
def classify_days(d):
    """Map a day count to a coarse segment: <=30 -> 0, <=90 -> 1, otherwise 2."""
    for segment, upper_bound in enumerate((30, 90)):
        if d <= upper_bound:
            return segment
    return 2

# Coarse 3-way segment, plus one binary "died within N days" flag per horizon.
df_death['死亡區段'] = df_death['觀察天數'].apply(classify_days)
for horizon in (30, 60, 90):
    df_death[f'label_{horizon}'] = (df_death['觀察天數'] <= horizon).astype(int)

def bin_days(d):
    """Map a day count to a grade: <=30 -> 0, <=60 -> 1, <=90 -> 2, <=120 -> 3, otherwise 4."""
    for grade, upper_bound in enumerate((30, 60, 90, 120)):
        if d <= upper_bound:
            return grade
    return 4

# Five-level grade of the observed days (see bin_days). apply() already returns
# a new Series, so no extra .copy() is needed.
# classify_days, 死亡區段 and label_30 (the early-death flag) are already defined
# earlier in this cell, so they are not defined or assigned a second time here.
df_death['死亡等級'] = df_death['觀察天數'].apply(bin_days)

# -----------------------------
# Step 3: classifier for the death segment
# -----------------------------
# Every column listed here is either the target itself or computed directly from
# 觀察天數 (死亡區段, 死亡等級, label_30/60/90). If any of them stays in the feature
# matrix the models can simply read the answer back, which yields meaningless
# perfect scores, so all of them are excluded.
target_derived_cols = [
    '觀察天數', '死亡標記', '死亡區段', '死亡等級',
    'label_30', 'label_60', 'label_90',
]
# Identifier-like columns, selected by the same rule the earlier cleaning cell
# applies before modelling; they are not predictive features.
id_like_cols = [c for c in df_death.columns if '_NUM_' in c or 'dbname_' in c] + ['H01_NUM', 'dbname']

X = df_death.drop(columns=target_derived_cols + id_like_cols, errors='ignore')
X = X.select_dtypes(include=['number']).astype('float32')
y_class = df_death['死亡區段']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, test_size=0.2, random_state=42)

clf = xgb.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
clf.fit(X_train_c, y_train_c)
pred_class = clf.predict(X_test_c)
class_acc = accuracy_score(y_test_c, pred_class)
print(f"📊 分類準確率（死亡區段）: {class_acc:.2%}")

# -----------------------------
# Step 3-2: binary classifiers for death within 30 / 60 / 90 days
# -----------------------------
# All three classifiers share the leak-free feature matrix X built above.
for label in ['label_30', 'label_60', 'label_90']:
    y_bin = df_death[label]
    X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, y_bin, test_size=0.2, random_state=42)
    clf_bin = xgb.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
    clf_bin.fit(X_train_bin, y_train_bin)
    y_pred_bin = clf_bin.predict(X_test_bin)
    print(f"📊 是否在 {label[-2:]} 天內死亡分類報告:")
    print(classification_report(y_test_bin, y_pred_bin))

# -----------------------------
# Step 4: per-segment regression (fitted on log1p of the days)
# -----------------------------
# NOTE(review): rows are routed to a segment using the TRUE observed days, so
# these metrics assume the segment is known in advance; they do not include the
# error of the step-3 segment classifier.
results = []
for group in [0, 1, 2]:
    df_sub = df_death[df_death['死亡區段'] == group]
    # Same leak-free feature columns as the classifiers, restricted to this segment.
    X_sub = X.loc[df_sub.index]
    y_sub = df_sub['觀察天數']
    y_sub_log = np.log1p(y_sub)
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_sub, y_sub_log, test_size=0.2, random_state=42)

    # random_state is fixed to match the other models in this notebook.
    model = xgb.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
    model.fit(X_train_r, y_train_r)
    y_pred_log = model.predict(X_test_r)
    # Map predictions and targets back from the log1p scale to days before scoring.
    y_pred = np.expm1(y_pred_log)
    y_true = np.expm1(y_test_r)

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    results.append({'死亡區段': group, 'MAE': mae, 'R²': r2})

# -----------------------------
# Step 5: quantile regression + prediction-interval plot
# -----------------------------
import plotly.graph_objects as go

# Build an explicit train/test split over ALL death samples. Reusing the
# X_train_r / y_train_r variables left over from the per-segment loop would
# silently train these models on the last segment only.
# Target-derived and identifier-like columns are excluded from the features.
quantile_excluded_cols = [
    '觀察天數', '死亡標記', '死亡區段', '死亡等級',
    'label_30', 'label_60', 'label_90', 'H01_NUM', 'dbname',
] + [c for c in df_death.columns if '_NUM_' in c or 'dbname_' in c]
X_q = (
    df_death.drop(columns=quantile_excluded_cols, errors='ignore')
    .select_dtypes(include=['number'])
    .astype('float32')
)
# Models are fitted on log1p(days); quantiles survive the monotonic expm1
# back-transform, so predictions can be mapped straight back to days.
y_q_log = np.log1p(df_death['觀察天數'])
X_train_q, X_test_q, y_train_q, y_test_q = train_test_split(X_q, y_q_log, test_size=0.2, random_state=42)

quantiles = [0.1, 0.5, 0.9]
models_q = {}   # one fitted booster per quantile, so later steps can pick the right one
preds_q = {}
for q in quantiles:
    model = lgb.train({
        'objective': 'quantile', 'alpha': q,
        'metric': 'mae', 'learning_rate': 0.1,
        'max_depth': 4, 'verbosity': -1
    }, lgb.Dataset(X_train_q, label=y_train_q), num_boost_round=100)
    models_q[q] = model
    preds_q[q] = np.expm1(model.predict(X_test_q))

# Interval plot. Rows are sorted by the actual value once, so every line trace
# runs left to right instead of zig-zagging across the chart.
df_interval = pd.DataFrame({
    '實際': np.expm1(y_test_q),
    'P10 預測': preds_q[0.1],
    'P50 預測': preds_q[0.5],
    'P90 預測': preds_q[0.9]
}).sort_values('實際')
fig_interval = px.scatter(df_interval, x='實際', y='P50 預測', title='P50 vs 實際死亡天數', labels={'P50 預測': '中位數預測'})
fig_interval.add_traces([
    px.line(df_interval, x='實際', y='P10 預測').update_traces(name='P10', line=dict(color='lightblue', dash='dot')).data[0],
    px.line(df_interval, x='實際', y='P90 預測').update_traces(name='P90', line=dict(color='lightblue', dash='dot')).data[0]
])
# Shaded P10-P90 band built from two go.Scatter traces: the upper bound is added
# first, then the lower bound fills up to it with fill='tonexty'.
fig_interval.add_trace(go.Scatter(
    x=df_interval['實際'],
    y=df_interval['P90 預測'],
    mode='lines',
    name='P90',
    line=dict(width=0),
    showlegend=False
))
fig_interval.add_trace(go.Scatter(
    x=df_interval['實際'],
    y=df_interval['P10 預測'],
    mode='lines',
    name='P10',
    fill='tonexty',
    fillcolor='rgba(173,216,230,0.3)',
    line=dict(width=0),
    showlegend=True
))
fig_interval.update_layout(
    legend_title_text='Quantile 區間',
    yaxis_title='預測天數',
    template='plotly_white'
)

# -----------------------------
# Step 6: SHAP explanation
# -----------------------------
# Explain the median (q = 0.5) model explicitly. After the loop above, `model`
# refers to the last quantile trained (q = 0.9), which is not the model named
# in the chart title.
explainer = shap.Explainer(models_q[0.5], X_train_q)
shap_values = explainer(X_train_q)
shap_df = pd.DataFrame(shap_values.values, columns=X_train_q.columns)
# Mean absolute SHAP value per feature, largest first.
mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False).reset_index()
mean_abs_shap.columns = ['特徵', '平均 SHAP']
shap_fig = px.bar(mean_abs_shap.head(20), x='平均 SHAP', y='特徵', orientation='h',
                  title='📊 特徵重要性（SHAP - P50 模型）')
shap_fig.update_layout(yaxis=dict(autorange='reversed'))

📥 載入資料中...
📊 分類準確率（死亡區段）: 100.00%
📊 是否在 30 天內死亡分類報告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1642
           1       1.00      1.00      1.00       991

    accuracy                           1.00      2633
   macro avg       1.00      1.00      1.00      2633
weighted avg       1.00      1.00      1.00      2633

📊 是否在 60 天內死亡分類報告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1119
           1       1.00      1.00      1.00      1514

    accuracy                           1.00      2633
   macro avg       1.00      1.00      1.00      2633
weighted avg       1.00      1.00      1.00      2633

📊 是否在 90 天內死亡分類報告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       735
           1       1.00      1.00      1.00      1898

    accuracy                           1.00      2633
   macro avg       1.00      1.00    

In [None]:
# Render the quantile-interval chart and the SHAP bar chart built by the pipeline cell.
fig_interval.show()
shap_fig.show()

In [None]:
# -----------------------------
# Step 7: output and display
# -----------------------------
print("\n📌 分段回歸結果（log 預測 → 還原）:")
# Human-readable day range for each segment id; unknown ids fall back to a placeholder.
segment_names = {0: '0-30天', 1: '31-90天', 2: '91-180天'}
for res in results:
    block = segment_names.get(res['死亡區段'], '未知區段')
    print(f"🔹 區段 {res['死亡區段']}（{block}）: MAE = {res['MAE']:.2f}, R² = {res['R²']:.2f}")


📌 分段回歸結果（log 預測 → 還原）:
🔹 區段 0（0-30天）: MAE = 7.31, R² = -0.11
🔹 區段 1（31-90天）: MAE = 7.46, R² = 0.75
🔹 區段 2（91-180天）: MAE = 12.01, R² = 0.67
