<a href="https://colab.research.google.com/github/peculab/AI4JUBO/blob/main/JuboDeath_V4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install shap plotly xgboost --quiet

In [2]:
!pip uninstall shap -y
!pip install shap --no-deps

Found existing installation: shap 0.47.2
Uninstalling shap-0.47.2:
  Successfully uninstalled shap-0.47.2
Collecting shap
  Downloading shap-0.48.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Downloading shap-0.48.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: shap
Successfully installed shap-0.48.0


In [3]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from IPython.display import display
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, mean_absolute_error, r2_score
)

In [5]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

In [6]:
# 前 13 個重要特徵
top_13_features = [
    '預估年齡', 'ADL_總分_max', '六個月內住院次數', '意識清醒_max', '使用呼吸輔具',
    '性別_is_male', 'CMS_value', 'SOF_總分_max', '體重變化_max', 'GDS_總分_max',
    '跌倒次數_std', '跌倒次數_max_1', '藥物數_max'
]

# 分析訓練資料的遺失值分布

In [7]:
def analyze_missing_values(df):
    import numpy as np
    import pandas as pd
    import warnings
    from IPython.display import display

    warnings.simplefilter(action='ignore', category=FutureWarning)

    print("📊 缺值分析報告與自動補值處理：\n")
    total_rows = len(df)

    # 自動將空白字串視為 NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # 建立報告列表
    missing_report = []

    # 特殊補值規則對照表（你可以在這裡擴充）
    special_fill_logic = {
        "CMS_value": "補眾數",
        "體重變化_max": "依性別與年齡區間分群補中位數，餘補均值",
        "SOF_總分_max": "CMS_value qcut 分群補中位數，餘補整體中位數",
        "GDS_總分_max": "依性別補中位數，餘補整體中位數",
        "ADL_總分_max": "補中位數",
        "預估年齡": "補中位數"
    }

    for col in df.columns:
        num_missing = df[col].isna().sum()
        percent_missing = (num_missing / total_rows) * 100
        dtype = df[col].dtype
        example_values = df[col].dropna().unique()[:5]

        fill_applied = None

        if col in special_fill_logic:
            # 根據補值邏輯自動執行補值
            if col == "CMS_value":
                mode_value = df[col].mode().dropna()
                if not mode_value.empty:
                    df[col] = df[col].fillna(mode_value[0])
                    fill_applied = "補眾數"

            elif col == "體重變化_max" and {"性別_is_male", "預估年齡"}.issubset(df.columns):
                df["年齡區間"] = pd.cut(df["預估年齡"], bins=[0, 65, 75, 85, 95, 150], labels=['<=65', '66-75', '76-85', '86-95', '>=96'])
                df[col] = df.groupby(["性別_is_male", "年齡區間"])[col].transform(lambda x: x.fillna(x.median()))
                df[col] = df[col].fillna(df[col].mean())
                df.drop(columns=["年齡區間"], inplace=True, errors='ignore')
                fill_applied = "依性別與年齡區間補中位數→補均值"

            elif col == "SOF_總分_max" and "CMS_value" in df.columns:
                try:
                    df["CMS_group"] = pd.qcut(df["CMS_value"], q=3, labels=["低", "中", "高"])
                    df[col] = df.groupby("CMS_group")[col].transform(lambda x: x.fillna(x.median()))
                except Exception as e:
                    print(f"⚠️ CMS 分群失敗：{e}")
                df[col] = df[col].fillna(df[col].median())
                df.drop(columns=["CMS_group"], inplace=True, errors='ignore')
                fill_applied = "CMS qcut 補中位數→補整體中位數"

            elif col == "GDS_總分_max" and "性別_is_male" in df.columns:
                df[col] = df.groupby("性別_is_male")[col].transform(lambda x: x.fillna(x.median()))
                df[col] = df[col].fillna(df[col].median())
                fill_applied = "性別分群補中位數→補整體中位數"

            elif col in ["ADL_總分_max", "預估年齡"]:
                df[col] = df[col].fillna(df[col].median())
                fill_applied = "補整體中位數"

        if num_missing > 0 or fill_applied:
            if np.issubdtype(dtype, np.number):
                method = "中位數 / 均值 / 分群補值 / 時序補值"
            else:
                method = "眾數 / 分群補值 / 自訂類別"

            missing_report.append({
                "欄位": col,
                "缺值數": num_missing,
                "缺值率 (%)": round(percent_missing, 2),
                "資料型別": str(dtype),
                "範例值": list(example_values),
                "建議補值方式": method,
                "實際補值策略": fill_applied if fill_applied else "未補值（手動決定）"
            })

    if missing_report:
        report_df = pd.DataFrame(missing_report)
        display(report_df.sort_values(by="缺值率 (%)", ascending=False).reset_index(drop=True))
        print("\n🔎 已套用特定邏輯的欄位已自動補值，其餘請依建議人工補值。\n")
    else:
        print("✅ 無缺值欄位。")

    return df

# 建立補值策略

In [19]:
def impute_features(df, mode='train', strategy: dict = None):
    """
    mode='train'：執行補值並回傳補值策略
    mode='apply'：根據 strategy 套用補值
    """
    df = df.copy()
    impute_dict = {}

    # 🧹 1. 空白字串視為 NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # ✅ 2. 數值欄轉 float
    float_cols = ['CMS_value', '體重變化_max', 'SOF_總分_max', '藥物數_max', 'GDS_總分_max', 'ADL_總分_max', '預估年齡']
    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # ✅ 3. 可安全補 0 的邏輯欄位
    fill_zero_cols = ['意識清醒_max', '跌倒次數_std', '跌倒次數_max_1', '使用呼吸輔具', '使用管路', '活動假牙', '使用精神藥', '多重用藥', '六個月內住院次數']
    for col in fill_zero_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            if set(df[col].dropna().unique()).issubset({0, 1}):
                df[col] = df[col].astype('category')

    if mode == 'train':
        # ➤ CMS_value 補眾數
        if 'CMS_value' in df.columns:
            impute_dict['CMS_value'] = df['CMS_value'].mode()[0]
            df['CMS_value'] = df['CMS_value'].fillna(impute_dict['CMS_value'])

        # ➤ ADL 與預估年齡補中位數
        impute_dict['ADL_總分_max'] = df['ADL_總分_max'].median()
        impute_dict['預估年齡'] = df['預估年齡'].median()
        df['ADL_總分_max'] = df['ADL_總分_max'].fillna(impute_dict['ADL_總分_max'])
        df['預估年齡'] = df['預估年齡'].fillna(impute_dict['預估年齡'])

        # ➤ GDS 分性別補值
        for gender in [0, 1]:
            impute_dict[f'GDS_總分_max_gender_{gender}'] = df[df['性別_is_male'] == gender]['GDS_總分_max'].median()
        impute_dict['GDS_總分_max_global'] = df['GDS_總分_max'].median()
        for gender in [0, 1]:
            idx = df['性別_is_male'] == gender
            df.loc[idx, 'GDS_總分_max'] = df.loc[idx, 'GDS_總分_max'].fillna(impute_dict[f'GDS_總分_max_gender_{gender}'])
        df['GDS_總分_max'] = df['GDS_總分_max'].fillna(impute_dict['GDS_總分_max_global'])

        # ➤ SOF 分 CMS 補中位數
        try:
            df['CMS_group'] = pd.qcut(df['CMS_value'], q=3, labels=False, duplicates='drop')
            for group in df['CMS_group'].dropna().unique():
                impute_dict[f'SOF_總分_max_group_{int(group)}'] = df[df['CMS_group'] == group]['SOF_總分_max'].median()
            df['SOF_總分_max'] = df.groupby('CMS_group')['SOF_總分_max'].transform(lambda x: x.fillna(x.median()))
            df.drop(columns='CMS_group', inplace=True)
        except Exception as e:
            print(f"⚠️ CMS_value 分群失敗：{e}")
        impute_dict['SOF_總分_max_global'] = df['SOF_總分_max'].median()
        df['SOF_總分_max'] = df['SOF_總分_max'].fillna(impute_dict['SOF_總分_max_global'])

        # ➤ 體重與藥物數：性別與年齡分群補中位數
        for col in ['體重變化_max', '藥物數_max']:
            for gender in [0, 1]:
                for age_bin in ['0-70', '70-80', '80-90', '90+']:
                    lo, hi = {'0-70': (0,70), '70-80': (70,80), '80-90': (80,90), '90+': (90,120)}[age_bin]
                    idx = (df['性別_is_male'] == gender) & (df['預估年齡'] > lo) & (df['預估年齡'] <= hi)
                    impute_dict[f'{col}_gender_{gender}_age_{age_bin}'] = df.loc[idx, col].median()
            impute_dict[f'{col}_global_mean'] = df[col].mean()
            # 實際補值
            for gender in [0, 1]:
                for age_bin in ['0-70', '70-80', '80-90', '90+']:
                    lo, hi = {'0-70': (0,70), '70-80': (70,80), '80-90': (80,90), '90+': (90,120)}[age_bin]
                    idx = (df['性別_is_male'] == gender) & (df['預估年齡'] > lo) & (df['預估年齡'] <= hi)
                    key = f'{col}_gender_{gender}_age_{age_bin}'
                    df.loc[idx, col] = df.loc[idx, col].fillna(impute_dict[key])
            df[col] = df[col].fillna(impute_dict[f'{col}_global_mean'])

    elif mode == 'apply' and strategy:
        # ➤ CMS
        df['CMS_value'] = df['CMS_value'].fillna(strategy['CMS_value'])
        df['ADL_總分_max'] = df['ADL_總分_max'].fillna(strategy['ADL_總分_max'])
        df['預估年齡'] = df['預估年齡'].fillna(strategy['預估年齡'])

        # ➤ GDS
        for gender in [0, 1]:
            idx = df['性別_is_male'] == gender
            df.loc[idx, 'GDS_總分_max'] = df.loc[idx, 'GDS_總分_max'].fillna(strategy[f'GDS_總分_max_gender_{gender}'])
        df['GDS_總分_max'] = df['GDS_總分_max'].fillna(strategy['GDS_總分_max_global'])

        # ➤ SOF
        try:
            df['CMS_group'] = pd.qcut(df['CMS_value'], q=3, labels=False, duplicates='drop')
            for group in df['CMS_group'].dropna().unique():
                key = f'SOF_總分_max_group_{int(group)}'
                df.loc[df['CMS_group'] == group, 'SOF_總分_max'] = df.loc[df['CMS_group'] == group, 'SOF_總分_max'] \
                    .fillna(strategy.get(key, np.nan))
            df.drop(columns='CMS_group', inplace=True)
        except:
            pass
        df['SOF_總分_max'] = df['SOF_總分_max'].fillna(strategy['SOF_總分_max_global'])

        # ➤ 體重 / 藥物數
        for col in ['體重變化_max', '藥物數_max']:
            for gender in [0, 1]:
                for age_bin in ['0-70', '70-80', '80-90', '90+']:
                    lo, hi = {'0-70': (0,70), '70-80': (70,80), '80-90': (80,90), '90+': (90,120)}[age_bin]
                    idx = (df['性別_is_male'] == gender) & (df['預估年齡'] > lo) & (df['預估年齡'] <= hi)
                    key = f'{col}_gender_{gender}_age_{age_bin}'
                    df.loc[idx, col] = df.loc[idx, col].fillna(strategy.get(key, np.nan))
            df[col] = df[col].fillna(strategy[f'{col}_global_mean'])

    # ✅ 9. 明確轉換類別型欄位
    categorical_binary_cols = ['性別_is_male', '使用呼吸輔具', '使用管路', '活動假牙', '使用精神藥', '多重用藥', '意識清醒_max']
    for col in categorical_binary_cols:
        if col in df.columns and df[col].nunique() <= 2:
            df[col] = df[col].astype("category")

    # 回傳
    if mode == 'train':
        return df, impute_dict
    else:
        return df

In [20]:
def apply_imputation(df, impute_dict):
    df = df.copy()

    # CMS
    if 'CMS_value' in df.columns:
        df['CMS_value'] = df['CMS_value'].fillna(impute_dict['CMS_value'])

    # ADL 與年齡
    df['ADL_總分_max'] = df['ADL_總分_max'].fillna(impute_dict['ADL_總分_max'])
    df['預估年齡'] = df['預估年齡'].fillna(impute_dict['預估年齡'])

    # GDS 分性別補中位數
    if '性別_is_male' in df.columns:
        for gender in [0, 1]:
            idx = df['性別_is_male'] == gender
            df.loc[idx, 'GDS_總分_max'] = df.loc[idx, 'GDS_總分_max'] \
                .fillna(impute_dict[f'GDS_總分_max_gender_{gender}'])
        df['GDS_總分_max'] = df['GDS_總分_max'].fillna(impute_dict['GDS_總分_max_global'])

    # SOF 分 CMS 群補中位數
    try:
        df['CMS_group'] = pd.qcut(df['CMS_value'], q=3, labels=False, duplicates='drop')
        for group in df['CMS_group'].dropna().unique():
            idx = df['CMS_group'] == group
            key = f'SOF_總分_max_group_{int(group)}'
            if key in impute_dict:
                df.loc[idx, 'SOF_總分_max'] = df.loc[idx, 'SOF_總分_max'].fillna(impute_dict[key])
    except:
        pass
    df['SOF_總分_max'] = df['SOF_總分_max'].fillna(impute_dict['SOF_總分_max_global'])

    # 體重變化 / 藥物數 → 性別與年齡分群中位數
    for col in ['體重變化_max', '藥物數_max']:
        for gender in [0, 1]:
            for age_bin in ['0-70', '70-80', '80-90', '90+']:
                age_range = {
                    '0-70': (0, 70),
                    '70-80': (70, 80),
                    '80-90': (80, 90),
                    '90+': (90, 120)
                }[age_bin]
                idx = (
                    (df['性別_is_male'] == gender) &
                    (df['預估年齡'] > age_range[0]) & (df['預估年齡'] <= age_range[1])
                )
                key = f'{col}_gender_{gender}_age_{age_bin}'
                df.loc[idx, col] = df.loc[idx, col].fillna(impute_dict.get(key, np.nan))
        df[col] = df[col].fillna(impute_dict[f'{col}_global_mean'])

    return df

# 載入外部資料

In [9]:
import pandas as pd
# read data and put it in a dataframe
# 在 google 工作表載入 gsheets
gsheets = gc.open_by_url('https://docs.google.com/spreadsheets/d/1Hoguf7PGhJoy0bGzIxf3P7yMHWHuKvdwyd4grxvRHkA/edit?usp=sharing')
worksheet = gsheets.worksheet("測試資料表")  # 指定分頁名稱
worksheet = worksheet.get_all_records()
external = pd.DataFrame(worksheet)
external = external.apply(lambda col: pd.to_numeric(col.astype(str).str.replace(',', '').str.strip(), errors='coerce'))
external.head()

Unnamed: 0,H01_NUM,dbname,死亡標記,觀察天數,H01_NUM_1,dbname_1,性別_is_male,預估年齡,DNR_flag,CDR_value,...,dbname_8,意識清醒_max,跌倒次數_max_1,跌倒次數_std,使用呼吸輔具,活動假牙,使用管路,H01_NUM_9,dbname_9,六個月內住院次數
0,1223,,0,180,1223,,0,75,0,,...,,0.0,,,1.0,0.0,1.0,,,
1,1224,,0,180,1224,,0,79,0,,...,,0.0,1.0,0.0,1.0,0.0,1.0,,,
2,1226,,0,180,1226,,0,83,0,,...,,0.0,1.0,0.0,1.0,0.0,1.0,,,
3,1227,,0,180,1227,,1,49,0,,...,,0.0,,,1.0,0.0,1.0,,,
4,1228,,0,180,1228,,0,84,0,,...,,0.0,,,1.0,0.0,1.0,,,


In [10]:
len(external)

640

In [11]:
ex_sub = external[top_13_features]
ex_y = external['死亡標記']
ex_sub.head()

Unnamed: 0,預估年齡,ADL_總分_max,六個月內住院次數,意識清醒_max,使用呼吸輔具,性別_is_male,CMS_value,SOF_總分_max,體重變化_max,GDS_總分_max,跌倒次數_std,跌倒次數_max_1,藥物數_max
0,75,70.0,,0.0,1.0,0,,1.0,3.0,0.0,,,10.0
1,79,20.0,,0.0,1.0,0,,1.0,3.0,1.0,0.0,1.0,2.0
2,83,0.0,,0.0,1.0,0,6.0,1.0,3.0,4.0,0.0,1.0,3.0
3,49,0.0,,0.0,1.0,1,,1.0,3.0,2.0,,,4.0
4,84,95.0,,0.0,1.0,0,,1.0,3.0,0.0,,,3.0


In [12]:
ex_sub.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
預估年齡,640.0,77.7625,12.989456,19.0,70.0,80.0,87.0,103.0
ADL_總分_max,518.0,22.905405,26.054264,0.0,0.0,12.5,40.0,100.0
六個月內住院次數,253.0,1.403162,0.758229,1.0,1.0,1.0,2.0,5.0
意識清醒_max,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
使用呼吸輔具,500.0,0.822,0.382896,0.0,1.0,1.0,1.0,1.0
性別_is_male,640.0,0.564063,0.496267,0.0,0.0,1.0,1.0,1.0
CMS_value,99.0,5.585859,1.767104,2.0,4.0,6.0,7.0,8.0
SOF_總分_max,333.0,0.963964,0.790698,0.0,0.0,1.0,1.0,3.0
體重變化_max,307.0,2.026059,1.022287,0.0,1.0,2.0,3.0,3.0
GDS_總分_max,315.0,2.647619,3.356658,0.0,0.0,1.0,5.0,14.0


In [13]:
analyze_missing_values(ex_sub)

📊 缺值分析報告與自動補值處理：

⚠️ CMS 分群失敗：Bin edges must be unique: Index([2.0, 8.0, 8.0, 8.0], dtype='float64', name='CMS_value').
You can drop duplicate edges by setting the 'duplicates' kwarg


Unnamed: 0,欄位,缺值數,缺值率 (%),資料型別,範例值,建議補值方式,實際補值策略
0,跌倒次數_std,615,96.09,float64,"[0.0, 0.707107]",中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）
1,跌倒次數_max_1,542,84.69,float64,"[1.0, 3.0, 5.0, 2.0, 0.0]",中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）
2,CMS_value,541,84.53,float64,"[6.0, 3.0, 2.0, 7.0, 4.0]",中位數 / 均值 / 分群補值 / 時序補值,補眾數
3,六個月內住院次數,387,60.47,float64,"[1.0, 2.0, 3.0, 4.0, 5.0]",中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）
4,體重變化_max,333,52.03,float64,"[3.0, 0.0, 1.0, 2.0]",中位數 / 均值 / 分群補值 / 時序補值,依性別與年齡區間補中位數→補均值
5,GDS_總分_max,325,50.78,float64,"[0.0, 1.0, 4.0, 2.0, 3.0]",中位數 / 均值 / 分群補值 / 時序補值,性別分群補中位數→補整體中位數
6,藥物數_max,320,50.0,float64,"[10.0, 2.0, 3.0, 4.0, 15.0]",中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）
7,SOF_總分_max,307,47.97,float64,"[1.0, 2.0, 3.0, 0.0]",中位數 / 均值 / 分群補值 / 時序補值,CMS qcut 補中位數→補整體中位數
8,意識清醒_max,140,21.88,float64,[0.0],中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）
9,使用呼吸輔具,140,21.88,float64,"[1.0, 0.0]",中位數 / 均值 / 分群補值 / 時序補值,未補值（手動決定）



🔎 已套用特定邏輯的欄位已自動補值，其餘請依建議人工補值。



Unnamed: 0,預估年齡,ADL_總分_max,六個月內住院次數,意識清醒_max,使用呼吸輔具,性別_is_male,CMS_value,SOF_總分_max,體重變化_max,GDS_總分_max,跌倒次數_std,跌倒次數_max_1,藥物數_max
0,75,70.0,,0.0,1.0,0,8.0,1.0,3.0,0.0,,,10.0
1,79,20.0,,0.0,1.0,0,8.0,1.0,3.0,1.0,0.0,1.0,2.0
2,83,0.0,,0.0,1.0,0,6.0,1.0,3.0,4.0,0.0,1.0,3.0
3,49,0.0,,0.0,1.0,1,8.0,1.0,3.0,2.0,,,4.0
4,84,95.0,,0.0,1.0,0,8.0,1.0,3.0,0.0,,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,88,70.0,1.0,0.0,1.0,1,8.0,1.0,3.0,1.0,,1.0,
636,68,10.0,1.0,0.0,1.0,0,8.0,1.0,3.0,1.0,,,7.0
637,68,0.0,,0.0,1.0,1,8.0,1.0,2.0,1.0,,,5.0
638,80,0.0,1.0,0.0,1.0,1,8.0,2.0,1.0,1.0,,1.0,2.0


# 載入訓練資料

In [14]:
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

sheet_url = "https://docs.google.com/spreadsheets/d/1p-7J-6lnpsi2i7a9z6ORktCZKXQGOw616GLVP7ynn7k/edit?usp=sharing"
sh = gc.open_by_url(sheet_url)
worksheet = sh.get_worksheet(0)

# 轉成 DataFrame
df = pd.DataFrame(worksheet.get_all_records())

# 🧹 空白字串 → NaN
df = df.replace(r'^\s*$', pd.NA, regex=True)

# ✅ 合理補 0 的欄位（請視實際資料是否存在這些欄位）
fill_zero_cols = [
    '意識清醒_max', '跌倒次數_std', '跌倒次數_max_1',
    '使用呼吸輔具', '使用管路', '活動假牙',
    '使用精神藥', '多重用藥', '六個月內住院次數', '藥物數_max'
]

# ⛑️ 自動補 0
for col in fill_zero_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(0)

# ✔️ 確認結果
print("✅ 前處理完成（補 0 欄位處理完畢）")
df.head()

✅ 前處理完成（補 0 欄位處理完畢）


Unnamed: 0,H01_NUM,dbname,死亡標記,觀察天數,H01_NUM_1,dbname_1,性別_is_male,預估年齡,DNR_flag,CDR_value,...,dbname_8,意識清醒_max,跌倒次數_max_1,跌倒次數_std,使用呼吸輔具,活動假牙,使用管路,H01_NUM_9,dbname_9,六個月內住院次數
0,1116,C0985,1,178,1116,C0985,1,90,0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1,1172,C0985,1,28,1172,C0985,1,73,0,,...,C0985,0.0,0.0,0.0,0.0,0.0,1.0,1172.0,C0985,1.0
2,1175,C0985,1,50,1175,C0985,0,91,0,,...,C0985,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0
3,1177,C0985,1,74,1177,C0985,0,71,0,,...,C0985,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0
4,1089,C0053,1,162,1089,C0053,1,87,0,,...,C0053,0.0,0.0,0.0,0.0,0.0,1.0,1089.0,C0053,1.0


In [15]:
df_sub = df[top_13_features]
y = df['死亡標記']
df_sub.head()

Unnamed: 0,預估年齡,ADL_總分_max,六個月內住院次數,意識清醒_max,使用呼吸輔具,性別_is_male,CMS_value,SOF_總分_max,體重變化_max,GDS_總分_max,跌倒次數_std,跌倒次數_max_1,藥物數_max
0,90,,0.0,0.0,0.0,1,,,,,0.0,0.0,0.0
1,73,,1.0,0.0,0.0,1,,,,,0.0,0.0,0.0
2,91,5.0,0.0,0.0,0.0,0,,,,0.0,0.0,0.0,0.0
3,71,10.0,0.0,0.0,0.0,0,,3.0,0.0,,0.0,0.0,1.0
4,87,20.0,1.0,0.0,0.0,1,,,,,0.0,0.0,0.0


In [16]:
df_sub.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
預估年齡,19969.0,118.0,84.0,815.0,,,,,,,
ADL_總分_max,18886.0,21.0,0.0,6366.0,,,,,,,
六個月內住院次數,26330.0,,,,0.902507,1.522849,0.0,0.0,0.0,1.0,31.0
意識清醒_max,26330.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
使用呼吸輔具,26330.0,,,,0.244132,0.42958,0.0,0.0,0.0,0.0,1.0
性別_is_male,26330.0,,,,0.532055,0.498981,0.0,0.0,1.0,1.0,1.0
CMS_value,3824.0,8.0,2.0,2360.0,,,,,,,
SOF_總分_max,5175.0,7.0,1.0,2272.0,,,,,,,
體重變化_max,4786.0,4.0,3.0,2756.0,,,,,,,
GDS_總分_max,12428.0,16.0,0.0,5589.0,,,,,,,


In [17]:
analyze_missing_values(df_sub)

📊 缺值分析報告與自動補值處理：

⚠️ CMS 分群失敗：Bin edges must be unique: Index([1.0, 2.0, 2.0, 8.0], dtype='float64', name='CMS_value').
You can drop duplicate edges by setting the 'duplicates' kwarg


Unnamed: 0,欄位,缺值數,缺值率 (%),資料型別,範例值,建議補值方式,實際補值策略
0,CMS_value,22506,85.48,object,"[8, 2, 5, 7, 3]",眾數 / 分群補值 / 自訂類別,補眾數
1,體重變化_max,21544,81.82,object,"[0, 3, 1, 2]",眾數 / 分群補值 / 自訂類別,依性別與年齡區間補中位數→補均值
2,SOF_總分_max,21155,80.35,object,"[3, 0, 2, 1, 4]",眾數 / 分群補值 / 自訂類別,CMS qcut 補中位數→補整體中位數
3,GDS_總分_max,13902,52.8,object,"[0, 8, 9, 5, 6]",眾數 / 分群補值 / 自訂類別,性別分群補中位數→補整體中位數
4,ADL_總分_max,7444,28.27,object,"[5, 10, 20, 0, 60]",眾數 / 分群補值 / 自訂類別,補整體中位數
5,預估年齡,6361,24.16,object,"[90, 73, 91, 71, 87]",眾數 / 分群補值 / 自訂類別,補整體中位數



🔎 已套用特定邏輯的欄位已自動補值，其餘請依建議人工補值。



Unnamed: 0,預估年齡,ADL_總分_max,六個月內住院次數,意識清醒_max,使用呼吸輔具,性別_is_male,CMS_value,SOF_總分_max,體重變化_max,GDS_總分_max,跌倒次數_std,跌倒次數_max_1,藥物數_max
0,90.0,15.0,0.0,0.0,0.0,1,2,1.0,1.0,1.0,0.0,0.0,0.0
1,73.0,15.0,1.0,0.0,0.0,1,2,1.0,2.0,1.0,0.0,0.0,0.0
2,91.0,5.0,0.0,0.0,0.0,0,2,1.0,2.0,0.0,0.0,0.0,0.0
3,71.0,10.0,0.0,0.0,0.0,0,2,3.0,0.0,2.0,0.0,0.0,1.0
4,87.0,20.0,1.0,0.0,0.0,1,2,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26325,89.0,0.0,1.0,0.0,1.0,1,2,1.0,1.0,1.0,0.0,1.0,0.0
26326,91.0,30.0,0.0,0.0,1.0,0,2,1.0,1.0,2.0,0.0,0.0,9.0
26327,81.0,60.0,0.0,0.0,1.0,1,2,1.0,3.0,1.0,0.0,0.0,8.0
26328,69.0,75.0,0.0,0.0,1.0,0,2,0.0,3.0,2.0,0.0,1.0,0.0


# 開始進行訓練

In [21]:
X, impute_strategy = impute_features(df_sub, mode='train')
y = df['死亡標記']

In [22]:
X.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
預估年齡,26330.0,,,,78.861679,12.638294,-2.0,75.0,81.0,85.0,125.0
ADL_總分_max,26330.0,,,,24.432207,27.789927,0.0,5.0,15.0,35.0,100.0
六個月內住院次數,26330.0,,,,0.902507,1.522849,0.0,0.0,0.0,1.0,31.0
意識清醒_max,26330.0,1.0,0.0,26330.0,,,,,,,
使用呼吸輔具,26330.0,2.0,0.0,19902.0,,,,,,,
性別_is_male,26330.0,2.0,1.0,14009.0,,,,,,,
CMS_value,26330.0,,,,2.187353,0.867954,1.0,2.0,2.0,2.0,8.0
SOF_總分_max,26330.0,,,,1.013635,0.473303,0.0,1.0,1.0,1.0,6.0
體重變化_max,26330.0,,,,2.344791,0.767208,0.0,2.0,3.0,3.0,3.0
GDS_總分_max,26330.0,,,,2.085568,2.457356,0.0,1.0,1.0,2.0,15.0


# XGBoost 的最佳化暴搜

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.datasets import make_classification
import pandas as pd

# Grid Search 參數
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

# 初始化模型與交叉驗證器
xgb = XGBClassifier(eval_metric="logloss")  # ❗ 刪除 use_label_encoder
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid Search
grid = GridSearchCV(xgb, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=1)

for col in X.select_dtypes(include='category').columns:
    X[col] = X[col].astype(int)

grid.fit(X, y)

# 匯總結果
results = pd.DataFrame(grid.cv_results_)

# 找出最佳模型與參數
best_model = grid.best_estimator_
best_params = grid.best_params_
best_score = grid.best_score_

# 顯示前幾個最好的組合
top_results = results.sort_values(by='mean_test_score', ascending=False).head(10)
print("Top Grid Search Results:")
print(top_results[['mean_test_score', 'params']])

print("\nBest Parameters:", best_params)
print("Best ROC AUC Score:", best_score)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Top Grid Search Results:
    mean_test_score                                             params
28         0.857721  {'colsample_bytree': 0.8, 'learning_rate': 0.1...
18         0.857557  {'colsample_bytree': 0.8, 'learning_rate': 0.0...
64         0.857461  {'colsample_bytree': 1, 'learning_rate': 0.1, ...
26         0.857456  {'colsample_bytree': 0.8, 'learning_rate': 0.1...
29         0.857230  {'colsample_bytree': 0.8, 'learning_rate': 0.1...
14         0.857202  {'colsample_bytree': 0.8, 'learning_rate': 0.0...
19         0.857091  {'colsample_bytree': 0.8, 'learning_rate': 0.0...
62         0.856902  {'colsample_bytree': 1, 'learning_rate': 0.1, ...
10         0.856894  {'colsample_bytree': 0.8, 'learning_rate': 0.0...
16         0.856878  {'colsample_bytree': 0.8, 'learning_rate': 0.0...

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best ROC AU

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_curve, auc, confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import plotly.figure_factory as ff

all_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost (PecuLab)": XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1,
                              subsample=0.8, colsample_bytree=0.8,
                              random_state=42, eval_metric='logloss'),
    "Alrawi (2013)": LogisticRegression(max_iter=1000),
    "Chandra (2022)": XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05,
                                    random_state=42, eval_metric='logloss'),
    "García-Gollarte (2020)": RandomForestClassifier(n_estimators=100, random_state=42),
    "Levy (2015)": LogisticRegression(max_iter=1000)
}

In [25]:
# Prepare CV and ROC Figure
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fig_roc = go.Figure()
mean_fpr = np.linspace(0, 1, 100)

results = []

# For each model
for model_name, model in all_models.items():
    accs, precs, recalls, f1s, aucs = [], [], [], [], []
    all_cm = np.zeros((2, 2), dtype=int)
    tprs = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # ROC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
        tpr_interp = np.interp(mean_fpr, fpr, tpr)
        tpr_interp[0] = 0.0
        tprs.append(tpr_interp)
        aucs.append(roc_auc)

        # Metrics
        accs.append(accuracy_score(y_test, y_pred))
        precs.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))

        cm = confusion_matrix(y_test, y_pred)
        all_cm += cm

    # Mean ROC
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    fig_roc.add_trace(go.Scatter(
        x=mean_fpr, y=mean_tpr, mode='lines',
        name=f"{model_name} (mean AUC={mean_auc:.3f})"
    ))

    results.append({
        'Model': model_name,
        'Accuracy Mean': np.mean(accs),
        'Accuracy Std': np.std(accs),
        'Precision Mean': np.mean(precs),
        'Recall Mean': np.mean(recalls),
        'F1 Score Mean': np.mean(f1s),
        'ROC AUC Mean': np.mean(aucs),
        'ROC AUC Std': np.std(aucs),
        'Confusion Matrix': all_cm
    })

# Random baseline
fig_roc.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1], mode='lines',
    line=dict(dash='dash'), name='Random Baseline'
))

fig_roc.update_layout(
    title="ROC Curve Comparison (Cross-Validation Mean)",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=800, height=600
)

In [26]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="ROC AUC Mean", ascending=False).reset_index(drop=True)
results_df

Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Precision Mean,Recall Mean,F1 Score Mean,ROC AUC Mean,ROC AUC Std,Confusion Matrix
0,XGBoost (PecuLab),0.78929,0.00195,0.750998,0.865629,0.80423,0.857729,0.003089,"[[9386, 3779], [1769, 11396]]"
1,Chandra (2022),0.789632,0.001262,0.750148,0.868591,0.80502,0.856575,0.002824,"[[9356, 3809], [1730, 11435]]"
2,Random Forest,0.76354,0.001662,0.736686,0.820281,0.776227,0.82796,0.002409,"[[9305, 3860], [2366, 10799]]"
3,García-Gollarte (2020),0.76354,0.001662,0.736686,0.820281,0.776227,0.82796,0.002409,"[[9305, 3860], [2366, 10799]]"
4,Logistic Regression,0.679263,0.006386,0.662761,0.729966,0.694705,0.737087,0.005916,"[[8275, 4890], [3555, 9610]]"
5,Alrawi (2013),0.679263,0.006386,0.662761,0.729966,0.694705,0.737087,0.005916,"[[8275, 4890], [3555, 9610]]"
6,Levy (2015),0.679263,0.006386,0.662761,0.729966,0.694705,0.737087,0.005916,"[[8275, 4890], [3555, 9610]]"


In [27]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Step 1: 排序並取前四個模型
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by="ROC AUC Mean", ascending=False).reset_index(drop=True)
top4 = sorted_results.head(4)

# Step 2: 建立 2x2 subplot
fig_cm_all = make_subplots(
    rows=2, cols=2,
    subplot_titles=[f"{row['Model']}" for _, row in top4.iterrows()],
    horizontal_spacing=0.15,
    vertical_spacing=0.15
)

# Step 3: 加入每個混淆矩陣圖與數字
for i, row in top4.iterrows():
    cm = row['Confusion Matrix']
    z = cm
    text = [[str(cell) for cell in row] for row in z]  # 轉成文字標註

    heatmap = go.Heatmap(
        z=z,
        x=['預測 非死亡', '預測 死亡'],
        y=['真實 非死亡', '真實 死亡'],
        colorscale='Blues',
        showscale=False,
        text=text,
        texttemplate="%{text}",  # 顯示文字內容
        hoverinfo="z"  # 只顯示數值
    )

    row_pos = i // 2 + 1
    col_pos = i % 2 + 1
    fig_cm_all.add_trace(heatmap, row=row_pos, col=col_pos)

# Step 4: 更新整體格式
fig_cm_all.update_layout(
    title="🔍 前四名模型 - 混淆矩陣比較圖（含數值）",
    width=900,
    height=800
)

fig_cm_all.show()

In [28]:
import shap

# 訓練模型（完整資料）
xgb_model = all_models["XGBoost (PecuLab)"]
xgb_model.fit(X, y)

# 建立 SHAP 解釋器
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X)

# 計算平均 SHAP 值絕對值（作為重要性）
import numpy as np
shap_abs_mean = np.abs(shap_values).mean(axis=0)

In [29]:
import pandas as pd
import plotly.express as px

# 整理成 DataFrame 並排序
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Mean |SHAP Value|': shap_abs_mean
}).sort_values(by='Mean |SHAP Value|', ascending=False)

# 畫前 20 名
top_n = 20
fig_bar = px.bar(
    importance_df.head(top_n),
    x='Mean |SHAP Value|',
    y='Feature',
    orientation='h',
    title="🎯 Top SHAP Features by Mean |SHAP|",
)
fig_bar.update_layout(yaxis=dict(categoryorder='total ascending'))
fig_bar.show()

# 畫出 XGBoost (PecuLab) 訓練完之後的樹

In [30]:
booster = xgb_model.get_booster()
dump = booster.get_dump()
print(dump[0])  # 看第一棵樹的原始內容

0:[預估年齡<82] yes=1,no=2,missing=2
	1:[預估年齡<81] yes=3,no=4,missing=4
		3:[六個月內住院次數<1] yes=7,no=8,missing=8
			7:[ADL_總分_max<40] yes=15,no=16,missing=16
				15:[GDS_總分_max<2] yes=31,no=32,missing=32
					31:leaf=0.0517920256
					32:leaf=-0.0166785475
				16:[預估年齡<57] yes=33,no=34,missing=34
					33:leaf=-0.164444447
					34:leaf=-0.105703428
			8:[ADL_總分_max<35] yes=17,no=18,missing=18
				17:[六個月內住院次數<6] yes=35,no=36,missing=36
					35:leaf=0.117360331
					36:leaf=-0.0142857153
				18:[ADL_總分_max<85] yes=37,no=38,missing=38
					37:leaf=0.0531401001
					38:leaf=-0.0333333351
		4:[ADL_總分_max<20] yes=9,no=10,missing=10
			9:[性別_is_male<1] yes=19,no=20,missing=20
				19:[使用呼吸輔具<1] yes=39,no=40,missing=40
					39:leaf=-0.165591404
					40:leaf=-0.184973165
				20:[使用呼吸輔具<1] yes=41,no=42,missing=42
					41:leaf=-0.1268868
					42:leaf=-0.163819104
			10:[SOF_總分_max<6] yes=21,no=22,missing=22
				21:leaf=-0.189324096
				22:leaf=-0.0500000007
	2:[ADL_總分_max<35] yes=5,no=6,missing=6
		5:[六個月

In [31]:
import re
import plotly.graph_objects as go

def parse_tree_with_tooltips(tree_dump_lines):
    edges = []
    labels = {}
    tooltips = {}
    positions = {}
    x_level = {}

    for line in tree_dump_lines:
        depth = line.count('\t')
        line = line.strip()
        node_id = int(re.search(r'^(\d+):', line).group(1))

        if "leaf=" in line:
            val = float(re.search(r'leaf=([-0-9.eE]+)', line).group(1))
            label = f"Leaf"
            tooltip = f"Leaf value: {val:.4f}"
        else:
            feature = re.search(r'\[(.+?)<', line).group(1)
            threshold = re.search(r'<([0-9.eE+-]+)\]', line).group(1)
            yes = int(re.search(r'yes=(\d+)', line).group(1))
            no = int(re.search(r'no=(\d+)', line).group(1))
            label = f"{feature}<br><{threshold[:6]}"  # 短縮顯示
            tooltip = f"{feature} < {threshold}"
            edges.append((node_id, yes))
            edges.append((node_id, no))

        labels[node_id] = label
        tooltips[node_id] = tooltip

        x = depth
        y = x_level.get(x, 0)
        x_level[x] = y + 1
        positions[node_id] = (x, -y)

    return edges, labels, tooltips, positions

def plot_xgb_tree_optimized(model, tree_index=0):
    booster = model.get_booster()
    dump_lines = booster.get_dump()[tree_index].splitlines()

    edges, labels, tooltips, positions = parse_tree_with_tooltips(dump_lines)

    fig = go.Figure()

    for src, dst in edges:
        x0, y0 = positions[src]
        x1, y1 = positions[dst]
        fig.add_trace(go.Scatter(
            x=[x0, x1], y=[y0, y1],
            mode='lines',
            line=dict(width=1, color='gray'),
            hoverinfo='none',
            showlegend=False
        ))

    for node_id, (x, y) in positions.items():
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color='lightblue'),
            text=[labels[node_id]],
            textposition="top center",
            hovertext=tooltips[node_id],
            hoverinfo='text',
            showlegend=False
        ))

    max_depth = max(x for x, _ in positions.values())
    fig.update_layout(
        title=f"🌳 XGBoost Tree {tree_index} 可讀決策樹圖（壓縮+說明）",
        xaxis=dict(showgrid=False, zeroline=False, visible=False),
        yaxis=dict(showgrid=False, zeroline=False, visible=False),
        margin=dict(t=60, l=40, r=40, b=20),
        height=150 + 150 * (max_depth + 1),
        plot_bgcolor="#f6f8fa"
    )
    fig.show()

In [32]:
plot_xgb_tree_optimized(xgb_model, tree_index=0)

# 追蹤 XGBoost 如何進行預測的過程

In [33]:
import re
import numpy as np
from xgboost import XGBClassifier

def trace_full_xgb_decision(model: XGBClassifier, X_row, feature_names=None, tree_limit=None):
    if feature_names is None:
        feature_names = X_row.columns.tolist()
    input_data = X_row.iloc[0].to_dict()

    booster = model.get_booster()
    tree_dumps = booster.get_dump(with_stats=True)
    n_trees = len(tree_dumps) if tree_limit is None else tree_limit
    print(f"🔍 分析樣本：{input_data}")
    print(f"🌲 模型共有 {len(tree_dumps)} 棵樹，將解析前 {n_trees} 棵")

    all_leaf_values = []
    for tree_idx in range(n_trees):
        lines = tree_dumps[tree_idx].splitlines()
        node_id = 0
        log = [f"\n🌳 第 {tree_idx} 棵樹"]
        while True:
            found_lines = [l for l in lines if re.match(rf"^{node_id}:", l)]
            if not found_lines:
                log.append(f"⚠️ 無法找到節點 {node_id}，跳過")
                break
            line = found_lines[0]

            if "leaf=" in line:
                leaf_val = float(re.search(r"leaf=([-0-9.eE]+)", line).group(1))
                log.append(f"👉 到達葉節點：leaf={leaf_val:.5f}")
                all_leaf_values.append(leaf_val)
                break
            else:
                feat_match = re.search(r"\[(.+?)<([-\d.eE]+)\]", line)
                yes = int(re.search(r"yes=(\d+)", line).group(1))
                no = int(re.search(r"no=(\d+)", line).group(1))

                feat_name = feat_match.group(1)
                threshold = float(feat_match.group(2))
                feat_val = input_data.get(feat_name, None)

                if feat_val is None:
                    log.append(f"⚠️ 找不到特徵 {feat_name} 的值，跳過")
                    break
                elif feat_val < threshold:
                    log.append(f"🔀 {feat_name}={feat_val} < {threshold} → 走 yes → {yes}")
                    node_id = yes
                else:
                    log.append(f"🔀 {feat_name}={feat_val} >= {threshold} → 走 no → {no}")
                    node_id = no
        for line in log:
            print(line)
    print(f"\n📊 所有 leaf 值加總後通過 sigmoid：")
    raw_score = np.sum(all_leaf_values)
    prob = 1 / (1 + np.exp(-raw_score))
    print(f"→ raw score: {raw_score:.5f}")
    print(f"→ probability (sigmoid): {prob:.5f}")
    return raw_score, prob

In [34]:
y.iloc[0]

np.int64(1)

In [35]:
xgb_model.predict(X.iloc[[0]])[0]

np.int64(1)

In [36]:
trace_full_xgb_decision(xgb_model, X.iloc[[0]])

🔍 分析樣本：{'預估年齡': 90.0, 'ADL_總分_max': 15.0, '六個月內住院次數': 0.0, '意識清醒_max': 0.0, '使用呼吸輔具': 0.0, '性別_is_male': 1.0, 'CMS_value': 2.0, 'SOF_總分_max': 1.0, '體重變化_max': 3.0, 'GDS_總分_max': 1.0, '跌倒次數_std': 0.0, '跌倒次數_max_1': 0.0, '藥物數_max': 0.0}
🌲 模型共有 100 棵樹，將解析前 100 棵

🌳 第 0 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 1 棵樹
🔀 ADL_總分_max=15.0 < 35.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 2 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 3 棵樹
🔀 ADL_總分_max=15.0 < 25.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 4 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 5 棵樹
🔀 ADL_總分_max=15.0 < 35.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 6 棵樹
🔀 ADL_總分_max=15.0 < 35.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 7 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 8 棵樹
🔀 ADL_總分_max=15.0 < 45.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 9 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 10 棵樹
🔀 預估年齡=90.0 >= 82.0 → 走 no → 2
⚠️ 無法找到節點 2，跳過

🌳 第 11 棵樹
🔀 ADL_總分_max=15.0 < 20.0 → 走 yes → 1
⚠️ 無法找到節點 1，跳過

🌳 第 12 棵樹
🔀 預估年齡=90.0 >= 82.0

(np.float64(0.0), np.float64(0.5))

這位 90 歲男性住民，模型認為他有較高的死亡風險，主要原因為：

    ADL_總分_max = 15 → 嚴重失能

    GDS_總分_max = 1 → 較輕度憂鬱傾向但仍具風險

    住院次數 = 0 → 雖無急性住院史，但搭配年齡與失能程度，仍推高風險

    體重變化 = 3kg → 顯示近期身體狀況有異常波動

# 測試外部資料在 XGBoost 模型下的結果

In [44]:
ex_X = apply_imputation(ex_sub, impute_strategy)
ex_y = external['死亡標記']

In [45]:
ex_X[top_13_features].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
預估年齡,640.0,77.7625,12.989456,19.0,70.0,80.0,87.0,103.0
ADL_總分_max,640.0,21.398438,23.640612,0.0,0.0,15.0,35.0,100.0
六個月內住院次數,253.0,1.403162,0.758229,1.0,1.0,1.0,2.0,5.0
意識清醒_max,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
使用呼吸輔具,500.0,0.822,0.382896,0.0,1.0,1.0,1.0,1.0
性別_is_male,640.0,0.564063,0.496267,0.0,0.0,1.0,1.0,1.0
CMS_value,640.0,2.554688,1.470676,2.0,2.0,2.0,2.0,8.0
SOF_總分_max,640.0,0.98125,0.570225,0.0,1.0,1.0,1.0,3.0
體重變化_max,640.0,2.098438,0.87461,0.0,1.0,2.0,3.0,3.0
GDS_總分_max,640.0,2.029688,2.456009,0.0,1.0,1.0,2.0,14.0


In [46]:
from sklearn.metrics import (
    confusion_matrix, roc_curve, roc_auc_score,
    accuracy_score, precision_score, recall_score, f1_score
)
import plotly.graph_objects as go
import numpy as np

def evaluate_xgb_model(model, X, y_true, model_name="XGBoost"):
    # ========= 預測 =========
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    # ========= 分數指標 =========
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    print(f"📊 [{model_name}] 評估指標")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUC:       {auc:.4f}")
    print()

    # ========= 混淆矩陣圖 =========
    cm = confusion_matrix(y_true, y_pred)
    labels = ["Negative", "Positive"]

    fig_cm = go.Figure(data=go.Heatmap(
        z=cm,
        x=labels,
        y=labels,
        colorscale='Blues',
        text=cm,
        texttemplate="%{text}"
    ))

    fig_cm.update_layout(
        title=f"Confusion Matrix ({model_name})",
        xaxis_title="Predicted",
        yaxis_title="Actual",
        yaxis_autorange='reversed'
    )
    fig_cm.show()

    # ========= ROC 曲線圖 =========
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)

    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f"{model_name} (AUC={auc:.3f})"))
    fig_roc.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random'))

    fig_roc.update_layout(
        title="ROC Curve",
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        width=600, height=500
    )
    fig_roc.show()

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "confusion_matrix": cm
    }

In [47]:
results = evaluate_xgb_model(xgb_model, ex_X[top_13_features], ex_y)

📊 [XGBoost] 評估指標
Accuracy:  0.6625
Precision: 0.7149
Recall:    0.5406
F1 Score:  0.6157
AUC:       0.7278

