# FIle Structure
- `origin_data`: From DB 的原始資料
- `origin_data_csv`: 原始資料轉檔為 `.csv`
- `logs`: 實驗輸出檔案資料夾
- `Z:` : 網路磁碟機(WFDB NAS)

In [1]:
ORIGIN_DATA = "origin_data"
DATA_CSV = "origin_data_csv"
LOGS = "logs"
MATCH = "Z:"
import pandas as pd

import numpy as np
import wfdb                                      # 讀取 WFDB header / record :contentReference[oaicite:4]{index=4}
from pathlib import Path                         # 物件導向檔案操作 :contentReference[oaicite:5]{index=5}
from datetime import datetime, timedelta, date, time
from tqdm import tqdm                            # 進度列（可省略）
import logging, os                               # 紀錄檔與系統路徑
from collections import defaultdict
import logging
import os
import ast
import re
from typing import List, Optional, Tuple,Set
from  tqdm import tqdm

def cross_validation_missing_subject(fileA: str | pd.DataFrame, fileB: str | pd.DataFrame, fileAname:str, fileBname:str):
    """
    列出
    - 哪些 subject id 存在於 fileA 但不存在於 fileB
    - 哪些 subject id 存在於 fileB 但不存在於 fileA
    並將結果列出

    Arg:
    - fileA: file path of fileA
    - fileB: file path of fileB
    """
    if not isinstance(fileA,pd.DataFrame):
        fileA_df = pd.read_csv(fileA)
    else:
        fileA_df = fileA

    if not isinstance(fileB,pd.DataFrame):
        fileB_df = pd.read_csv(fileB)
    else:
        fileB_df = fileB
        

    # 確認欄位名稱（假設欄位叫 SUBJECT_ID）
    if 'SUBJECT_ID' not in fileA_df.columns or 'SUBJECT_ID' not in fileB_df.columns:
        raise ValueError("Both files must contain 'SUBJECT_ID' column")

    # 轉換為集合
    setA = set(fileA_df['SUBJECT_ID'].dropna().astype(str))
    setB = set(fileB_df['SUBJECT_ID'].dropna().astype(str))

    # 找差集
    only_in_A = setA - setB
    only_in_B = setB - setA

    print(f"✅ SUBJECT_ID 存在於 {fileAname} 但不存在於 {fileBname}, 共 {len(only_in_A)}:")
    print(only_in_A if only_in_A else "無")
    
    print(f"\n✅ SUBJECT_ID 存在於 {fileBname} 但不存在於 {fileAname}共 {len(only_in_B)}:")
    print(only_in_B if only_in_B else "無")

    # 回傳結果（以 dict）
    return {
        "only_in_A": only_in_A,
        "only_in_B": only_in_B
    }

try:
    alive_yuran = pd.read_csv("./experiment_data_from_yuran/alive_42731_withHRV.csv")
    dead_yuran = pd.read_csv("./experiment_data_from_yuran/dead_42731_withHRV.csv")

    alive_set = set(alive_yuran['SUBJECT_ID'].to_list())
    dead_set = set(dead_yuran["SUBJECT_ID"].to_list())

    total_set = alive_set | dead_set
except Exception as e:
    print(e)



In [2]:
# ================================= 初始化紀錄 =================================

if os.path.isdir(LOGS):
    print(f"{LOGS} folder exist.")
else:
    os.makedirs("logs", exist_ok=True)
    print(f"{LOGS} folder doesn't exist, creating new {LOGS}")

# 建立 Logger
try:
    logger = logging.getLogger("data_clean")
    logger.setLevel(logging.WARNING)  # WARNING 以上都會被記錄

    # 建立 FileHandler，寫入 logs/clean.log
    fh = logging.FileHandler(f"{LOGS}/clean.log", mode="w", encoding="utf-8")
    # 只輸出訊息本身：SUBJECT_ID REASON
    formatter = logging.Formatter("%(message)s")
    fh.setFormatter(formatter)

    # 避免重複加入 handler
    if not logger.handlers:
        logger.addHandler(fh)
    print("Logger module create success.")
except Exception as e:
    raise ValueError(e)

def record_log(subject_id: str, reason: str):
    """
    將不合格的資料記錄到 logs/clean.log。
    例如： logger.warning("12345 invalid_date")
    """
    logger.warning(f"{subject_id} {reason}")

logs folder exist.
Logger module create success.


# Step 1 區分 Survivor Cohart / Deceased Cohart
- 具備 ICD Code 42731診斷碼與ICU紀錄的病患
    - "網站說 ECG 一定在 ICU 內"
    - "有可能沒有 ICU STAY 紀錄"
- 每個subject id 根據 ecg datime只取時間最晚的那一筆
    - 定義 `ecg datime只取時間最晚的那一筆`:
        - 如果 有多個紀錄的 WFDB header ，只取最後一筆
    - 然後ecg datetime一定要在「入出院」時間內，不是icu進出時間
        - 入院不管
        - 出院或死亡:容忍 30min(ICU OUTTIME)
- 根據 `ADMISSION`: hospital_expire_flag 區分死亡與存活
    - 死亡時間: Patient.DOD
        - 如果 DOD 缺失當作存活(by Yuran), 也有人直接篩掉(by Fish Yang)


In [2]:
# ================================= 篩選基本 Clinical Data: base =================================
def load_clinical_tables():
    """
    篩選 具備 ICD Code 42731診斷碼與ICU紀錄的病患
    Return:
    - base : pd.DataFrame, 患者綜合表格，包含以下資訊
        來自 patients.csv
        - SUBJECT_ID：患者唯一識別碼。

        admissions 表格
        - HADM_ID：住院請求編號，用於唯一識別一次住院。
        - ADMITTIME：病患入院時間（時間戳記）。
        - DISCHTIME：病患出院時間（時間戳記）。
        - HOSPITAL_EXPIRE_FLAG：出院時是否死亡（1 = 出院時已逝，0 = 否）。
        - DEATHTIME: 院內死亡時間

        icustays 表格
        - ICUSTAY_ID：ICU 住院期間的唯一識別符。
        - INTIME：進入 ICU 的時間戳記。
        - OUTTIME：離開 ICU 的時間戳記。

        diagnoses_icd 表格
        - ICD9_CODE：使用 ICD‑9 編碼系統記錄的診斷代碼（最多 6 位字元，包含空格，有些是 V 開頭代碼）
    """
    adm   = pd.read_csv(os.path.join(DATA_CSV,"ADMISSIONS.csv"), usecols=['SUBJECT_ID','HADM_ID',
                                                        'ADMITTIME','DISCHTIME','DEATHTIME',
                                                        'HOSPITAL_EXPIRE_FLAG'])

    icu   = pd.read_csv(os.path.join(DATA_CSV,"ICUSTAYS.csv"),
                        usecols=['SUBJECT_ID','ICUSTAY_ID','HADM_ID','INTIME','OUTTIME'])

    diag  = pd.read_csv(os.path.join(DATA_CSV,"DIAGNOSES_ICD.csv"),
                        usecols=['SUBJECT_ID','HADM_ID','ICD9_CODE'])
    # ICD9 42731 = AF
    # ── 1. 找出 AF 病人 ───────────────────────
    af_subjects = diag.loc[diag['ICD9_CODE']=='42731', 'SUBJECT_ID'].unique()

    # ── 2. 保留這些病人的全部 ICU stay ───────
    icu_af = icu[icu['SUBJECT_ID'].isin(af_subjects)]

    # ── 3. 加入 ADMISSIONS 資料 ───────────────
    base = icu_af.merge(adm, on=['SUBJECT_ID', 'HADM_ID'], how='left')

    # ── 4. (可選) 標記該次住院是否含 AF ──────
    af_flag = (diag[diag['ICD9_CODE']=='42731']
            [['SUBJECT_ID','HADM_ID']]
            .drop_duplicates()
            .assign(HAS_AF=1))
    base = base.merge(af_flag,
                    on=['SUBJECT_ID','HADM_ID'],
                    how='left') \
            .fillna({'HAS_AF': 0})

    # ── 5. 去重（若 diag 同 HADM_ID 多筆 42731）──
    base = base.drop_duplicates(subset=['ICUSTAY_ID'])

    # 刪除非 AF 患者
    base = base[base['HAS_AF'] == 1]

    # (New) 只保留最後一筆 ICU 紀錄
    print("只取每個病人最後一筆 ICU紀錄")
    # 先依 SUBJECT_ID 升序、INTIME 降序排序
    base = base.sort_values(by=['SUBJECT_ID', 'INTIME'], ascending=[True, False])
    # 對每個病人取第一筆（也就是最後一筆 ICU）
    base = base.drop_duplicates(subset='SUBJECT_ID', keep='first')

    # 儲存
    base.to_csv(os.path.join(LOGS, "base.csv"), index=False)
    return base

base = load_clinical_tables()

print("總資料: ",len(base))
print("總共多少 subject id",base['SUBJECT_ID'].nunique())

"""Analysis with Final"""
print(f"Missing ID (total {len(total_set - set(base['SUBJECT_ID'].to_list()))}): {total_set - set(base['SUBJECT_ID'].to_list())}")





只取每個病人最後一筆 ICU紀錄
總資料:  10252
總共多少 subject id 10252
Missing ID (total 0): set()


In [27]:
# ================================= 根據 base.csv 結果篩選 WFDB ECG 訊號: ecg_match.csv =================================

# Helper function
def parse_base_datetime(hdr):
    """
    處理 Header base time data type (適用於不同 wfdb 版本)
    Args:
    - hdr: wfdb.header 物件

    Return:
    - t0: 紀錄的起始時間, datetime 物件
    """
    if getattr(hdr, "base_datetime", None):           # v4.x 直接提供
        return hdr.base_datetime
    # base_date 可能已是 datetime.date
    bdate = hdr.base_date if isinstance(hdr.base_date, date) \
            else datetime.strptime(hdr.base_date, "%d/%m/%Y").date()
    # base_time 可能已是 datetime.time
    btime = hdr.base_time if isinstance(hdr.base_time, time) \
            else datetime.strptime(hdr.base_time.split(".")[0], "%H:%M:%S").time()
    return datetime.combine(bdate, btime)

def select_fitness_signal(base_series:pd.Series, filenames: List[str])->Optional[str]:
    """
    篩選該筆 ICU 紀錄時間是否有吻合的 ECG 訊號
    - Match Rule: ICUSTAY.INTIME in [base_datetime - 1hr , base_date_time + 1hr]

    Args:
    - base_series: pd.Series, including cols ['SUBJECT_ID','INTIME']
    - filenames: List[str], match subject id 的所有 WFDB Header
    """
    pattern = re.compile(r'-(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})\.hea$')
    parsed = []
    
    for fn in filenames:
        m = pattern.search(fn)
        if m:
            dt = datetime.strptime(
                "-".join(m.groups()[:3]) + "-" +
                ":".join(m.groups()[3:]),
                "%Y-%m-%d-%H:%M"
            )
            parsed.append((fn, dt))
        else:
            print(f"file {fn} 日期解析錯誤")
    
    if not parsed:
        print(f"{filenames[0].split('-')[0]} : 沒有任何檔案被正確日期解析")
        return None
    

    intime = pd.to_datetime(base_series["INTIME"]) + pd.Timedelta(minutes=30)
    

    for fn,dt in parsed:
        # print(f"Current fn:{fn}")
        rec_path = os.path.join(MATCH,fn[0:3],fn.split("-")[0],fn.removesuffix(".hea"))
        hdr = wfdb.rdheader(rec_path, rd_segments=True)
        base_dt = parse_base_datetime(hdr)
       
        # ❶ 先算邊界
        lower = base_dt - pd.Timedelta(hours=1)
        upper = base_dt + pd.Timedelta(hours=1)
        # ❷ 再做 AND 比對（左右都加括號）
        if ((intime >= lower) & (intime <= upper)).any():
            return fn  # 直接回傳檔名字串
        
    return None






def select_latest_signal(filenames: List[str]) -> Optional[str]:
    """
    擷取一連串 WFDB Header 檔案字串做時間處理，並篩選出最晚一筆的 Header
    Args:
    - filenames: List[str], match subject id 的所有 WFDB Header

    Return:
    - last_header: str, WFDB 的最後一筆 Header 檔案名稱
    """
    pattern = re.compile(r'-(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})\.hea$')
    parsed = []
    
    for fn in filenames:
        m = pattern.search(fn)
        if m:
            dt = datetime.strptime(
                "-".join(m.groups()[:3]) + "-" +
                ":".join(m.groups()[3:]),
                "%Y-%m-%d-%H:%M"
            )
            parsed.append((fn, dt))
        else:
            print(f"file {fn} 日期解析錯誤")
    
    if not parsed:
        print(f"{filenames[0].split('-')[0]} : 沒有任何檔案被正確日期解析")
        return None
    return max(parsed, key=lambda x: x[1])[0]

def last_lead2_segment_end(hdr, rec_dir: str) -> Optional[Tuple[datetime, float]]:
    """
    計算每一個 含有 Lead II segment 的 ECG 時間長度(in sec)，並以最後一個含 Lead II 的 segment 的結束作為整段ECG訊號的結束(即 T1)

    Args:
    - hdr: wfdb.rdheader() 回傳的 MultiRecordHeader 物件
    - rec_dir: 該 header 檔案所在資料夾 (Path，不含 .hea)

    Return
    - (T1, total_lead2_sec): 
        T1 為最後一個 Lead II segment 的結束 datetime
        total_lead2_sec 為所有 segment 中包含 Lead II 的總秒數
      若完全沒有 Lead II，回傳 None。
    """
    base_dt = parse_base_datetime(hdr)
    if base_dt is None:
        return None

    # 1) 取段名與長度（nsamp） —— 來源 hdr.seg_name / hdr.seg_len
    seg_names = hdr.seg_name           # list[str]
    seg_lens  = hdr.seg_len            # list[int]  :contentReference[oaicite:3]{index=3}
    assert len(seg_names) == len(seg_lens)

    # 2) 預先算 cumulative start index
    cum = [0]
    for L in seg_lens[:-1]:
        cum.append(cum[-1] + L)

    total_sec = 0.0
    last_end  = None

    # 3) 倒序掃描段
    for seg_name, start_idx, seg_len in reversed(list(zip(seg_names, cum, seg_lens))):
        if seg_name == "~":            # gap 段 :contentReference[oaicite:4]{index=4}
            continue
        sub_hdr = wfdb.rdheader(Path(rec_dir, seg_name))
        if "II" not in [n.upper().replace(" ", "") for n in sub_hdr.sig_name]:
            continue

        seg_sec = seg_len / sub_hdr.fs
        total_sec += seg_sec
        
        if last_end is None: #找到最後一段，先行紀錄
            seg_start_abs = base_dt + timedelta(seconds=start_idx / hdr.fs)
            last_end = seg_start_abs + timedelta(seconds=seg_sec)


    if last_end is None:
        return None
    return last_end, total_sec


def header_time_range(header_path:str)->Optional[Tuple[datetime,datetime,float]]:
    """
    計算 ECG訊號的起始時間，關於時間算法的界定如下:
    - T0: ECG 訊號起始時間，這裡採用 `header.base_dt`: 整個 multi-segment record 的起始時間，而不是某個 segment 的起始時間
        - 更精確做法應該採用 seg 內段落(待考慮，予以保留)
    - T1: ECG 訊號結束時間
        - 採用最後一筆 LEAD II seg 作為ECG訊號截止時間
        - 計算方式: 
            - seg_end = seg_start + (seg_length_in_seconds)
            - seg_length_in_seconds = sub_hdr.sig_len / sub_hdr.fs
    Arg:
    - header_path: WFDB header 的完整path : Z:/p00/p000085/p000085-2167-07-25-21-11.hea
    """
    rec_path   = Path(header_path).with_suffix("")    # 去掉 .hea
    hdr        = wfdb.rdheader(str(rec_path), rd_segments=True)
    if hdr.base_time is None or hdr.base_date is None:
        print(f"{rec_path.name}: missing base_date/time")
        return None

    t0 = parse_base_datetime(hdr)
    res = last_lead2_segment_end(hdr, rec_path.parent)
    if res is None:
        print(f"{rec_path.name}: 不存在 Lead II")
        return None

    t1, l2_sec = res
    return t0, t1, l2_sec

# main preprocsee pipeline
def match_last_ecg_signal(base:pd.DataFrame):
    """
    根據 subject id 搜尋是否有對應的 ECG 訊號，如果單一 subject id 對應多段 ECG singal(WFDB Header) 則只讀取最後一段

    Args:
    - base_subject_id: Set, base 篩選出來的 subject_id
    """
    record_df = pd.read_csv(os.path.join(DATA_CSV,"RECORDS.csv"),dtype={
        "SUBJECT_ID":int,
        "PATH":str
    })
    match_record_df = record_df[record_df['SUBJECT_ID'].isin(base['SUBJECT_ID'])]
    print(f"Total Exist Subject ID in MATCH/RECORD : {len(match_record_df)}")

    last_match_ecg = []
    last_t0 = []
    last_t1 = []
    total_l2_sec = []

    # 篩選 match 過後的 record 並找尋最後一筆 ECG Header

    for _, row in tqdm(match_record_df.iterrows(), desc="Total Subject id : ", unit=" id/s"):
        rec_paths = os.path.join(MATCH, row['PATH']) # rec_paths: Z://pxx//pxxnnnn
        
        
        wfdb_headers = [x for x in os.listdir(rec_paths) if row['PATH'].split("/")[1] in x and not x.split(".")[0].endswith("n")]
        if not wfdb_headers:
            print(f"{row['SUBJECT_ID']} 沒有符合規定的 header (包括結尾不含 n)")
            record_log(row['SUBJECT_ID'],"沒有符合規定的 header (包括結尾不含 n)")
            last_match_ecg.append(None)
            last_t0.append(None)
            last_t1.append(None)
            total_l2_sec.append(0)
            continue

        match_header = select_fitness_signal(base.loc[base['SUBJECT_ID']==row["SUBJECT_ID"]],wfdb_headers) 
        last_match_ecg.append(match_header)
    match_record_df['MATCH_ECG'] = last_match_ecg
    match_record_df.to_csv(os.path.join(LOGS, "test_ecg_match.csv"),index=False)
    return match_record_df
    """
        last_header = select_latest_signal(wfdb_headers)
        last_match_ecg.append(last_header)

        # 計算 T0/T1/Total Lead II secs
        result = None  # 初始化，避免未定義錯誤

        if last_header:
            result = header_time_range(Path(os.path.join(rec_paths,last_header)))
        else:
            record_log(row['SUBJECT_ID'],"沒有找到最後一筆的 Header")

        if result:
            last_t0.append(result[0])
            last_t1.append(result[1])
            total_l2_sec.append(result[2])
        else:
            last_t0.append(None)
            last_t1.append(None)
            total_l2_sec.append(0)
            record_log(row['SUBJECT_ID'],"所有的 segment record 都沒有 LEAD II ECG")
        
    match_record_df["LAST_ECG"] = last_match_ecg
    match_record_df['T0'] = last_t0
    match_record_df["T1"] = last_t1
    match_record_df['LEAD2_SEC'] = total_l2_sec

    match_record_df.to_csv(os.path.join(LOGS, "ecg_match.csv"),index=False)
    """

#match_record_df=match_last_ecg_signal(set(base['SUBJECT_ID'].to_list()))
match_record_df=match_last_ecg_signal(base)
match_record_df = match_record_df.dropna(axis=0)
print(total_set - set(match_record_df['SUBJECT_ID'].to_list()))


Total Exist Subject ID in MATCH/RECORD : 3028


Total Subject id : : 12 id/s [00:19,  1.31 id/s/s]

495 沒有符合規定的 header (包括結尾不含 n)
507 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 21 id/s [00:33,  1.87s/ id/s]

700 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 35 id/s [00:58,  2.17s/ id/s]

981 沒有符合規定的 header (包括結尾不含 n)
1006 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 65 id/s [01:30,  1.47s/ id/s]

1613 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 92 id/s [02:52,  4.09s/ id/s]

2265 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 105 id/s [03:26,  3.75s/ id/s]

2619 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 254 id/s [11:14,  4.62s/ id/s]

6519 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 264 id/s [12:33,  6.06s/ id/s]

6659 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 268 id/s [12:34,  2.08s/ id/s]

6749 沒有符合規定的 header (包括結尾不含 n)


Total Subject id : : 312 id/s [15:34,  2.99s/ id/s]


KeyboardInterrupt: 

In [4]:
# ================================= 篩選 ECG 時間段與匹配 Clinical Data : merge_subject_filtered/detail.csv =================================
#helper function
# 多補上 ECG 時間需要落在 入出院時間
def ecg_in_icu(base_row:pd.Series, ecg_match:pd.DataFrame) -> bool:
    """
    判斷 ECG record 時間是否在ICU離開時間:
    Yuran Rules
    - 不檢查 ECG 起點是否大於 ADMITTIME (預設 ECG 一定會落在內)
    - Range formula: 只要有任一筆 T1 ≤ OUTTIME + 30 分鐘，回傳 True
    Args:
    - base_row : pandas.Series
        必須含下列欄位：
        'SUBJECT_ID' : int
        'INTIME'  : str 或 datetime  (ICU 進入時間，可不使用)
        'OUTTIME'  : str 或 datetime  (ICU 離開時間)
    - ecg_match : pandas.DataFrame
        欄位必須包含：
        ['SUBJECT_ID', 'T0', 'T1']

    Returns
    - bool
        - True  → 至少有一筆 ECG 結束時間 T1 在（OUTTIME + 30 分鐘）之前
        - False → 所有 ECG 訊號都超出此時間，或根本沒有 ECG 記錄（並已記錄 log）
    """
    # --- 1. 解析ICU期間 --------------------------------------
    sid        = int(base_row["SUBJECT_ID"])
    icuout_time = pd.to_datetime(base_row["OUTTIME"]) + pd.Timedelta(minutes=30)


    # --- 2. 擷取該病人所有 ECG 記錄 --------------------------
    ecg = ecg_match[ecg_match.SUBJECT_ID == sid]
    if ecg.empty:
        record_log(sid,"在檢查 ecg_in_icu 時沒有 ECG signal")
        return False   # 此病人沒有波形

    if ecg["T0"].dtype == "object":
        ecg = ecg.assign(T0=pd.to_datetime(ecg["T0"]),
                         T1=pd.to_datetime(ecg["T1"]))
        
    # --- 3. 檢查是否處於 ICU 離院前 ------------------------------
    # 條件：  (T1 <= DISCHTIME)
    in_window = (ecg["T1"] <= icuout_time).any()

    return bool(in_window)


def ecg_in_admission(base_row:pd.Series, ecg_match:pd.DataFrame) -> bool:
    """
    判斷 ECG record 時間是否完全落於該次入院/出院時間:
    
    - Range formula: [T0, T1] ⊆ [ADMITTIME-30min, DISCHTIME] 
    
    Args:
    - base_row : pandas.Series
        必須含下列欄位：
        'SUBJECT_ID' : int
        'ADMITTIME'  : str 或 datetime  (住院 進入時間)
        'DISCHTIME'  : str 或 datetime  (出院 離開時間)
    - match_ecg_row : pandas.DataFrame
        欄位必須包含：
        ['SUBJECT_ID', 'T0', 'T1']

    Returns
    - bool
        True  → ECG 訊號落在允許出入院期間
        False → 完全無交集
    """
    # --- 1. 解析住院期間 --------------------------------------
    sid        = int(base_row["SUBJECT_ID"])
    admit_time = pd.to_datetime(base_row["ADMITTIME"]) - pd.Timedelta(minutes=30)
    disch_time = pd.to_datetime(base_row["DISCHTIME"])

    # --- 2. 擷取該病人所有 ECG 記錄 --------------------------
    ecg = ecg_match[ecg_match.SUBJECT_ID == sid]
    if ecg.empty:
        return False   # 此病人沒有波形

    if ecg["T0"].dtype == "object":
        ecg = ecg.assign(T0=pd.to_datetime(ecg["T0"]),
                         T1=pd.to_datetime(ecg["T1"]))
    # --- 3. 檢查是否「完整包含」 ------------------------------
    # 條件： (T0 >= ADMITTIME-30min) AND (T1 <= DISCHTIME)
    in_window = (
        (ecg["T0"] >= admit_time) &
        (ecg["T1"] <= disch_time)
    ).any()

    return bool(in_window)

def ecg_overlaps_icu(base_row:pd.Series, ecg_match:pd.DataFrame) -> bool:
    """
    判斷單一 ICU 住院是否有任何 ECG record 與 ICU 期間重疊
    
    Args:
    - base_row : pandas.Series
        必須含下列欄位：
        'SUBJECT_ID' : int
        'INTIME'     : str 或 datetime  (ICU 進入時間)
        'OUTTIME'    : str 或 datetime  (ICU 離開時間)
        'ADMITTIME'  : str 或 datetime  (住院 進入時間)
        'DISCHTIME'  : str 或 datetime  (出院 離開時間)
    - match_ecg_row : pandas.DataFrame
        欄位必須包含：
        ['SUBJECT_ID', 'T0', 'T1']

    Returns
    - bool
        True  → ICU 期間與任一 ECG (T0,T1) 有交集
        False → 完全無交集
    """
    # --- 1. 解析 ICU 期間 --------------------------------------
    sid   = int(base_row['SUBJECT_ID'])
    intime  = pd.to_datetime(base_row['INTIME'])
    outtime = pd.to_datetime(base_row['OUTTIME'])

    # --- 2. 擷取該病人的所有 ECG 記錄 --------------------------
    ecg = ecg_match[ecg_match.SUBJECT_ID == sid]
    if ecg.empty:
        return False   # 無任何波形
    
    if ecg["T0"].dtype == "object":
        ecg = ecg.assign(T0=pd.to_datetime(ecg["T0"]),
                         T1=pd.to_datetime(ecg["T1"]))

    # --- 3. 檢查是否重疊 --------------------------------------
    # 條件： max(T0, INTIME) < min(T1, OUTTIME)
    overlaps = (
        (ecg['T0'].clip(lower=intime) <
         ecg['T1'].clip(upper=outtime))
        .any()
    )
    return bool(overlaps)
# main processing pipeline
def match_ecg_clinic(base:pd.DataFrame, ecg_match:pd.DataFrame)->pd.DataFrame:
    """
    匹配 ECG 訊號屬於 SUBJECT ID 中哪一次的 HADM_ID -> ICUSTAY_ID, 先以 T0,T1 有沒有與ICU.INTIME/OUTTIME Overlap
    """
    ecg_match_drop = ecg_match.dropna()
    print(f"Total ECG Record : {len(ecg_match_drop)}")
    merge_subject_detail = base.copy()
    merge_subject_detail['ECG_OVERLAP'] = base.apply(ecg_overlaps_icu, axis=1, ecg_match =ecg_match_drop )
    merge_subject_filterd = merge_subject_detail[merge_subject_detail['ECG_OVERLAP']]

    print(f"After Matching, total data : {len(merge_subject_filterd)}, Total Subject ID: {merge_subject_filterd['SUBJECT_ID'].nunique()}")
    merge_subject_filterd.to_csv(os.path.join(LOGS,"merge_test.csv"),index=False)
    return merge_subject_filterd

def merge_ecg_clinic(base:pd.DataFrame, ecg_match:pd.DataFrame)->pd.DataFrame:
    """
    - 匹配 ECG 訊號屬於 SUBJECT ID 中哪一次的 HADM_ID -> ICUSTAY_ID
        - 以時間段匹配，ECG的 T0,T1必須介於該次出入院內
    - 流程
        - 移除 `ecg_match` 中不存在T0,T1的資料(保留方便複查)
        - 匹配 ICU_STAY_TIME 與 T0,T1 是否有 Overlap
        - 移除不匹配的紀錄
        - 進一步檢查 T0有沒有
    
    Args:
    - base: pd.Dataframe, made from function load_clinical_tables()
    - ecg_match: pd.Dataframe, made from function match_last_ecg_signal()

    Return:
    """
    ecg_match_drop = ecg_match.dropna()
    print(f"Total ECG Record : {len(ecg_match_drop)}")

    merge_subject_detail = base.copy()
    merge_subject_detail['ECG_OVERLAP'] = base.apply(ecg_overlaps_icu, axis=1, ecg_match =ecg_match_drop )

    # 改用嚴格匹配 ICU 離開時間: ecg_in_admission -> replace with ecg_in_icu
    # merge_subject_detail['ECG_IN_ADMISSION'] = base.apply(ecg_in_admission, axis=1, ecg_match =ecg_match_drop)
    merge_subject_detail['ECG_IN_ICU'] = base.apply(ecg_in_icu, axis=1, ecg_match =ecg_match_drop)

    merge_subject_detail.to_csv(os.path.join(LOGS,"merge_subject_detail.csv"),index= False)

    merge_subject_filterd = merge_subject_detail[merge_subject_detail['ECG_IN_ICU'] & merge_subject_detail['ECG_OVERLAP']]
    merge_subject_filterd.to_csv(os.path.join(LOGS,"merge_subject_filtered.csv"),index= False)

    # 多筆 subject id 通過條件分析
    counts = merge_subject_filterd["SUBJECT_ID"].value_counts()
    multi = counts[counts > 1].index.tolist()
    print("同一個 subject_id 出現超過一次：", multi)
    
    return merge_subject_filterd

ecg_match = pd.read_csv(os.path.join(LOGS,"ecg_match_info.csv"))
# merge_subject_filterd = merge_ecg_clinic(base,ecg_match)

"""先進行 Overlap 分析結果"""
merge_subject_filterd = match_ecg_clinic(base,ecg_match)
alive_set = set(alive_yuran['SUBJECT_ID'].to_list())
dead_set =  set(dead_yuran['SUBJECT_ID'].to_list())
print(alive_set&dead_set)

total_subject_yuran = set(alive_yuran['SUBJECT_ID'].to_list()) | set(dead_yuran['SUBJECT_ID'].to_list())
print(f"Yuran Alive Missing (total: {len(alive_set-set(merge_subject_filterd['SUBJECT_ID'].to_list()))}) : \n{alive_set-set(merge_subject_filterd['SUBJECT_ID'].to_list())}")
print(f"Yuran dead Missing (total: {len(dead_set-set(merge_subject_filterd['SUBJECT_ID'].to_list()))}) : \n{dead_set-set(merge_subject_filterd['SUBJECT_ID'].to_list())}")





Total ECG Record : 6632
After Matching, total data : 2238, Total Subject ID: 2238
set()
Yuran Alive Missing (total: 0) : 
set()
Yuran dead Missing (total: 0) : 
set()


In [16]:
from typing import Tuple
merge_subject_filterd = pd.read_csv("./logs/merge_subject_filtered.csv")
def split_groups(df:pd.DataFrame)->Tuple[pd.DataFrame,pd.DataFrame]:
    """
    Based on the HOSPITAL_EXPIRE_FLAG to split the group surv/mort
    """
    surv  = df[df['HOSPITAL_EXPIRE_FLAG']==0].copy()
    mort  = df[df['HOSPITAL_EXPIRE_FLAG']==1].copy()
    return surv, mort
surv,mort = split_groups(merge_subject_filterd)
print("========  Stage 1 Result ==============")
print(f"Survior Cohort: Number of Data {len(surv)}; Total Subject ID: {surv['SUBJECT_ID'].nunique()}")
print(f"Deceased Cohort: Number of Data {len(mort)}; Total Subject ID: {mort['SUBJECT_ID'].nunique()}")
alive = pd.read_csv("./experiment_data_from_yuran/alive_42731_withHRV.csv")
dead = pd.read_csv("./experiment_data_from_yuran/dead_42731_withHRV.csv")
surv_cross = cross_validation_missing_subject(surv,alive, "My Surv","Yuran Surv")
dead_cross = cross_validation_missing_subject(mort,dead,"My Dead","Yuran Dead")
print(mort.info())

Survior Cohort: Number of Data 1775; Total Subject ID: 1775
Deceased Cohort: Number of Data 398; Total Subject ID: 398
✅ SUBJECT_ID 存在於 My Surv 但不存在於 Yuran Surv, 共 1444:
{'73190', '10241', '83272', '63512', '24925', '61956', '21071', '84775', '17412', '79352', '71652', '41882', '32247', '4490', '46467', '69776', '93602', '18910', '51642', '65370', '25452', '15583', '75034', '15079', '59102', '81349', '50643', '24693', '76844', '66311', '95948', '18614', '89459', '84150', '14298', '20795', '53355', '43738', '9486', '3695', '15610', '58022', '93479', '69871', '90814', '98961', '57511', '98555', '1528', '9274', '93159', '94768', '6116', '60272', '26380', '76801', '19125', '18254', '71596', '63525', '76797', '72999', '67087', '43296', '18837', '65404', '15538', '54675', '89556', '84206', '41204', '29664', '63961', '89012', '23510', '99560', '98494', '85', '15198', '14532', '63733', '54174', '59225', '29215', '98280', '42721', '72151', '64137', '86381', '80286', '15021', '75607', '31633', '

# Step 2 : Survivor / Deceased Cohort Exclusion
## Deceased Cohort Exclusion Criteria
- Post-mortem ECG present
    - T1 < ADMISSION.deathtime
        - Issue: if ADMISSION.deathtime missed, filling with ADMISSION.DISCHTIME // Revised: 直接過濾掉
- Age > 125 year
    - age計算 : PATIENTS.dob + ADMISSION.admittime(入院時間)
- no Continuous Lead II ECG >= 10 hr before death
    - Total Signal Length >= 36000 sec

In [18]:
import numpy as np
from dateutil.relativedelta import relativedelta

def merge_age_by_admit_mort(mort: pd.DataFrame) -> pd.DataFrame:
    """
    將 Stage 1 Deceased Cohort 引入 patient.dob(病患出生日期)，並計算入院時年紀
    - Issue: admissions.deathtime(死亡日期)缺失則以ICU離開時間填補(Issue: ICU OUTTIME 比 ADMISSIONS.DEATHTIME 晚 WTF)
        - 0728:Update: 應該要直接排除

    Args:
    - mort : pd.DataFrame, Stage 1 Deceased Cohort

    Return:
    - mort_with_Age_Death: pd.DataFrame, 新增欄位 ["DOB","AGE_AT_ADMISSION_YYMMDDHHMMSS"]
    """
    try:
        pat = pd.read_csv(os.path.join(DATA_CSV, "PATIENTS.csv"),
                          usecols=['SUBJECT_ID', 'DOB'])
        # adm = pd.read_csv(os.path.join(DATA_CSV, "ADMISSIONS.csv"),
        #                   usecols=['SUBJECT_ID','HADM_ID' ,'DEATHTIME'])
    except FileNotFoundError:
        raise ValueError(f"Required CSV files not found in {DATA_CSV}")
    
    # 2. 轉成 pandas 的 datetime64（先做 basic 清理，coerce 會把壞格式設為 NaT）
    pat['DOB']        = pd.to_datetime(pat['DOB'], errors='coerce')
    mort['ADMITTIME']  = pd.to_datetime(mort['ADMITTIME'], errors='coerce')
    # adm['DEATHTIME']  = pd.to_datetime(adm['DEATHTIME'], errors='coerce')
    
    # 3. 合併資料（mort 為基準，用 left join）
    mort_with_Age_Death = (
        mort
        .merge(pat, on="SUBJECT_ID", how="left")                # PATIENTS 仍僅用 SUBJECT_ID
        # .merge(                                                  # ADMISSIONS 用雙欄位對齊
        #     adm[['SUBJECT_ID', 'HADM_ID', 'DEATHTIME']],        # 先保留要用的欄位
        #     on=["SUBJECT_ID", "HADM_ID"],                       # 這裡傳 list
        #     how="left"
        # )
    )
    print(f'After Merge Total Data: {len(mort_with_Age_Death)}')
    print(f'After merge SUBJECT ID: {mort_with_Age_Death["SUBJECT_ID"].nunique()}')
    
    # 4. 定義逐列計算 age 的函式
    def calc_age_ymdhms(row):
        dob   = row['DOB']
        admit = row['ADMITTIME']
        # 如果有任一是 NaT，就回傳 NaN
        if pd.isna(dob) or pd.isna(admit):
            return np.nan
        # 用 python datetime 計算差分
        rd = relativedelta(admit.to_pydatetime(), dob.to_pydatetime())
        # 格式化：YYYYMMDDHHMMSS
        return (
            f"{rd.years:04d}"
            f"{rd.months:02d}"
            f"{rd.days:02d}"
            f"{rd.hours:02d}"
            f"{rd.minutes:02d}"
            f"{rd.seconds:02d}"
        )
    
    # 5. 套用到整張表
    mort_with_Age_Death['AGE_AT_ADMISSION_YYMMDDHHMMSS'] = mort_with_Age_Death.apply(calc_age_ymdhms, axis=1)
    
    if mort_with_Age_Death['DEATHTIME'].isnull().any():
        # print("DEATHTIME 欄位檢測到缺失，以離開ICU時間做填補")
        # mort_with_Age_Death['DEATHTIME'] = mort_with_Age_Death['DEATHTIME'].fillna(mort_with_Age_Death['OUTTIME'])
        print("DEATHTIME 欄位檢測到缺失，移除缺失欄位")
        mort_with_Age_Death= mort_with_Age_Death[~mort_with_Age_Death['DEATHTIME'].isnull()]

    mort_with_Age_Death.to_csv(os.path.join(LOGS,"test.csv"),index= False)
    print(f'After Saving Total Data: {len(mort_with_Age_Death)}')
    print(f'After Saving SUBJECT ID: {mort_with_Age_Death["SUBJECT_ID"].nunique()}')
    print(mort_with_Age_Death.info())
    return mort_with_Age_Death

print(f'Total Data: {len(mort)}')
print(f'Total SUBJECT ID: {mort["SUBJECT_ID"].nunique()}')
mort_with_Age_Death = merge_age_by_admit_mort(mort)

print(f'Total Data: {len(mort_with_Age_Death)}')
print(f'Total SUBJECT ID: {mort_with_Age_Death["SUBJECT_ID"].nunique()}')

Total Data: 398
Total SUBJECT ID: 398
After Merge Total Data: 398
After merge SUBJECT ID: 398
After Saving Total Data: 398
After Saving SUBJECT ID: 398
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   SUBJECT_ID                     398 non-null    int64         
 1   HADM_ID                        398 non-null    int64         
 2   ADMITTIME                      398 non-null    datetime64[ns]
 3   DISCHTIME                      398 non-null    object        
 4   DEATHTIME                      398 non-null    object        
 5   HOSPITAL_EXPIRE_FLAG           398 non-null    int64         
 6   ICUSTAY_ID                     398 non-null    int64         
 7   INTIME                         398 non-null    object        
 8   OUTTIME                        398 non-null    object        
 9   I

In [19]:
def filter_age_l2Time_postECG(mort_with_Age_Death:pd.DataFrame)->pd.DataFrame:
    """
    合併 ecg_match.csv (需要包含[T0,T1,LEAD2_SEC]等欄位)，並依照指定條件做篩選
    Exclusion Criteria
    - Age > 125 year
    - Post ECG : T1 > DEATHTIME
    - LEAD2_SEC < 36000 sec
    """
    try:
        ecg_match = pd.read_csv(os.path.join(LOGS, "ecg_match.csv"))
    except FileNotFoundError:
        raise ValueError(f"Required CSV files not found in {LOGS}")

    mort_with_Age_Death_ecg = mort_with_Age_Death.merge(ecg_match,on = ['SUBJECT_ID'],how = "left")

    for col in ['T0', 'T1']:
        mort_with_Age_Death_ecg[col] = pd.to_datetime(mort_with_Age_Death_ecg[col], errors='coerce')
    if mort_with_Age_Death_ecg['T0'].isna().any() or mort_with_Age_Death_ecg['T1'].isna().any():
        n0 = mort_with_Age_Death_ecg['T0'].isna().sum()
        n1 = mort_with_Age_Death_ecg['T1'].isna().sum()
        print(f"Warning: T0 缺漏 {n0} 筆, T1 缺漏 {n1} 筆 → 已刪除這些筆資料")
        mort_with_Age_Death_ecg = mort_with_Age_Death_ecg.dropna(subset=['T0','T1']).reset_index(drop=True)
    
    # 構建篩選條件
    # 1. 年齡 <= 125
    mort_with_Age_Death_ecg['AGE_YEARS'] = (
    mort_with_Age_Death_ecg['AGE_AT_ADMISSION_YYMMDDHHMMSS']
        .astype(str).str.zfill(14).str[:4]        # 取前 4 位：0000‒9999 年
        .astype(int)                              # 轉成整數
    )
    mask_age = mort_with_Age_Death_ecg['AGE_YEARS'] <= 125

    # 2 ECG 在死亡前(T1 <= DEATHTIME)
    mort_with_Age_Death_ecg['DEATHTIME'] = pd.to_datetime(mort_with_Age_Death_ecg['DEATHTIME'], errors='coerce')
    mask_post = (mort_with_Age_Death_ecg['T1'] <= mort_with_Age_Death_ecg['DEATHTIME']) 

    # 3 LEAD2_SEC >= 36000
    mask_lead = mort_with_Age_Death_ecg['LEAD2_SEC'] >= 36000

    # 保留所有判斷結果方便複查
    mort_with_Age_Death_ecg['AGE_UNDER_125'] = mask_age
    mort_with_Age_Death_ecg['WIOUT_POST_ECG'] = mask_post
    mort_with_Age_Death_ecg['ENOUGH_LEAD2'] = mask_lead
    # 最終篩選
    mort_stage2_filtered = mort_with_Age_Death_ecg.loc[mask_age & mask_post & mask_lead].reset_index(drop=True)

    mort_stage2_filtered.to_csv(os.path.join(LOGS,"mort_stage2_filtered.csv"),index = False)
    mort_with_Age_Death_ecg.to_csv(os.path.join(LOGS,"mort_stage2_detailed.csv"),index = False)

    return mort_stage2_filtered

mort_stage2_filtered = filter_age_l2Time_postECG(mort_with_Age_Death)

print(f"=========== Stage II Deceased Cohort Result =========== ")
print(f'Total Data: {len(mort_stage2_filtered)}')
print(f'Total SUBJECT ID: {mort_stage2_filtered["SUBJECT_ID"].nunique()}')


Total Data: 242
Total SUBJECT ID: 242


## Survivor Cohort Exclusion Criteria

- Age > 125 year
    - age計算 : PATIENTS.dob + ADMISSION.admittime(入院時間)
- no Continuous Lead II ECG >= 10 hr before discharge
    - Total Signal Length >= 36000 sec

In [20]:
# Exception Analsyis : Duplicated SUBJECT ID 93432
print(f"Total data: {len(surv['SUBJECT_ID'])}")
print(f"total ID: {surv['SUBJECT_ID'].nunique()}")
print(surv['SUBJECT_ID'].value_counts()[0:3])
"""
經分析剛好兩筆時間資料接近資料重疊，屬於篩選條件的edge case，只有最後一筆為正確資料
處理方式: 主動刪除 subject id : 93432, hadm id : 190234 , icustay_id: 204538 的該筆資料
"""

# surv_remove_edgeCase = surv.drop(surv[surv["ICUSTAY_ID"] == 204538].index).reset_index(drop=True)
# print(f"After Remove Total data: {len(surv_remove_edgeCase['SUBJECT_ID'])}")
# print(f"After Remove total ID: {surv_remove_edgeCase['SUBJECT_ID'].nunique()}")
# print(surv_remove_edgeCase.info())




Total data: 1775
total ID: 1775
SUBJECT_ID
98769    1
85       1
214      1
Name: count, dtype: int64


'\n經分析剛好兩筆時間資料接近資料重疊，屬於篩選條件的edge case，只有最後一筆為正確資料\n處理方式: 主動刪除 subject id : 93432, hadm id : 190234 , icustay_id: 204538 的該筆資料\n'

In [None]:
def merge_age_by_admit_surv(surv_remove_edgeCase: pd.DataFrame) -> pd.DataFrame:
    """
    將 Stage 1 surv Cohort 引入 patient.dob(病患出生日期)，並計算入院時年紀
    Args:
    - surv_remove_edgeCase : pd.DataFrame, Stage 1 Surv Cohort 移除所有重複 subject id 結果

    Return:
    - surv_with_Age: pd.DataFrame, 新增欄位 ["DOB","AGE_AT_ADMISSION_YYMMDDHHMMSS"]
    """
    try:
        pat = pd.read_csv(os.path.join(DATA_CSV, "PATIENTS.csv"),
                          usecols=['SUBJECT_ID', 'DOB'])

    except FileNotFoundError:
        raise ValueError(f"Required CSV files not found in {DATA_CSV}")
    
    # 2. 轉成 pandas 的 datetime64（先做 basic 清理，coerce 會把壞格式設為 NaT）
    pat['DOB']        = pd.to_datetime(pat['DOB'], errors='coerce')
    surv_remove_edgeCase['ADMITTIME']  = pd.to_datetime(surv_remove_edgeCase['ADMITTIME'], errors='coerce')
    # mask_bad = surv_remove_edgeCase['ADMITTIME'].isna()
    # print("被轉 NaT 的行數:", mask_bad.sum())
    # print(surv_remove_edgeCase.loc[mask_bad, ['SUBJECT_ID', 'ADMITTIME']].head())

    
    # 3. 合併資料（surv_remove_edgeCase為基準，用 left join）
    surv_with_Age = (
        surv_remove_edgeCase
        .merge(pat, on="SUBJECT_ID", how="left")                # PATIENTS 仍僅用 SUBJECT_ID
    )
    print(f'After Merge Total Data: {len(surv_with_Age)}')
    print(f'After merge SUBJECT ID: {surv_with_Age["SUBJECT_ID"].nunique()}')
    
    # 4. 定義逐列計算 age 的函式
    def calc_age_ymdhms(row):
        dob   = row['DOB']
        admit = row['ADMITTIME']
        # 如果有任一是 NaT，就回傳 NaN
        if pd.isna(dob) or pd.isna(admit):
            return np.nan
        # 用 python datetime 計算差分
        rd = relativedelta(admit.to_pydatetime(), dob.to_pydatetime())
        # 格式化：YYYYMMDDHHMMSS
        return (
            f"{rd.years:04d}"
            f"{rd.months:02d}"
            f"{rd.days:02d}"
            f"{rd.hours:02d}"
            f"{rd.minutes:02d}"
            f"{rd.seconds:02d}"
        )
    
    # 5. 套用到整張表
    surv_with_Age['AGE_AT_ADMISSION_YYMMDDHHMMSS'] = surv_with_Age.apply(calc_age_ymdhms, axis=1)

    surv_with_Age.to_csv(os.path.join(LOGS,"test.csv"),index= False)
    print(f'After Saving Total Data: {len(surv_with_Age)}')
    print(f'After Saving SUBJECT ID: {surv_with_Age["SUBJECT_ID"].nunique()}')
    print(f'最終缺失值檢測: {surv_with_Age.info()}')
    return surv_with_Age

surv_with_Age = merge_age_by_admit_surv(surv)


After Merge Total Data: 1775
After merge SUBJECT ID: 1775
After Saving Total Data: 1775
After Saving SUBJECT ID: 1775
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1775 entries, 0 to 1774
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   SUBJECT_ID                     1775 non-null   int64         
 1   HADM_ID                        1775 non-null   int64         
 2   ADMITTIME                      1775 non-null   datetime64[ns]
 3   DISCHTIME                      1775 non-null   object        
 4   DEATHTIME                      0 non-null      object        
 5   HOSPITAL_EXPIRE_FLAG           1775 non-null   int64         
 6   ICUSTAY_ID                     1775 non-null   int64         
 7   INTIME                         1775 non-null   object        
 8   OUTTIME                        1775 non-null   object        
 9   ICD9_CODE                      17

In [23]:
def filter_age_l2Time(surv_with_Age:pd.DataFrame)->pd.DataFrame:
    """
    合併 ecg_match.csv (需要包含[T0,T1,LEAD2_SEC]等欄位)，並依照指定條件做篩選
    Exclusion Criteria
    - Age > 125 year
    - LEAD2_SEC < 36000 sec
    """
    try:
        ecg_match = pd.read_csv(os.path.join(LOGS, "ecg_match.csv"))
    except FileNotFoundError:
        raise ValueError(f"Required CSV files not found in {LOGS}")

    surv_with_Age_ecg = surv_with_Age.merge(ecg_match,on = ['SUBJECT_ID'],how = "left")

    for col in ['T0', 'T1']:
        surv_with_Age_ecg[col] = pd.to_datetime(surv_with_Age_ecg[col], errors='coerce')
    if surv_with_Age_ecg['T0'].isna().any() or surv_with_Age_ecg['T1'].isna().any():
        n0 = surv_with_Age_ecg['T0'].isna().sum()
        n1 = surv_with_Age_ecg['T1'].isna().sum()
        print(f"Warning: T0 缺漏 {n0} 筆, T1 缺漏 {n1} 筆 → 已刪除這些筆資料")
        surv_with_Age_ecg = surv_with_Age_ecg.dropna(subset=['T0','T1']).reset_index(drop=True)
    
    # 構建篩選條件
    # 1. 年齡 <= 125
    surv_with_Age_ecg['AGE_YEARS'] = (
    surv_with_Age_ecg['AGE_AT_ADMISSION_YYMMDDHHMMSS']
        .astype(str).str.zfill(14).str[:4]        # 取前 4 位：0000‒9999 年
        .astype(int)                              # 轉成整數
    )
    mask_age = surv_with_Age_ecg['AGE_YEARS'] <= 125


    # 3 LEAD2_SEC >= 36000
    mask_lead = surv_with_Age_ecg['LEAD2_SEC'] >= 36000

    # 保留所有判斷結果方便複查
    surv_with_Age_ecg['AGE_UNDER_125'] = mask_age
    surv_with_Age_ecg['ENOUGH_LEAD2'] = mask_lead
    # 最終篩選
    surv_stage2_filtered = surv_with_Age_ecg.loc[mask_age & mask_lead].reset_index(drop=True)

    surv_stage2_filtered.to_csv(os.path.join(LOGS,"surv_stage2_filtered.csv"),index = False)
    surv_with_Age_ecg.to_csv(os.path.join(LOGS,"surv_stage2_detailed.csv"),index = False)

    return surv_stage2_filtered

surv_stage2_filtered = filter_age_l2Time(surv_with_Age)

print(f"=========== Stage II S Cohort Result =========== ")
print(f'Total Data: {len(surv_stage2_filtered)}')
print(f'Total SUBJECT ID: {surv_stage2_filtered["SUBJECT_ID"].nunique()}')


Total Data: 1448
Total SUBJECT ID: 1448


In [32]:
# Cross Validation
alive = pd.read_csv("./experiment_data_from_yuran/alive_42731_withHRV.csv")
alive_set = set(alive['SUBJECT_ID'].to_list())
surv_stage2_set = set(surv_stage2_filtered['SUBJECT_ID'])
print(alive_set-surv_stage2_set)
print(len(alive_set-surv_stage2_set))
print(len(surv_stage2_set-alive_set))

# Check Surv Rule
# T1<ICU_OUT / DISCHTIME (取時間更早的)
outlier = surv_stage2_filtered[surv_stage2_filtered['SUBJECT_ID'].isin(alive_set & surv_stage2_set)]

# 檢查時間欄位：轉為 datetime（保險起見，避免格式問題）
for col in ['T0', 'T1', 'OUTTIME', 'DISCHTIME', 'INTIME', 'ADMITTIME']:
    outlier[col] = pd.to_datetime(outlier[col], errors='coerce')

# --- 規則 1：T1 < min(OUTTIME, DISCHTIME) ---
outlier['MIN_OUT'] = outlier[['OUTTIME', 'DISCHTIME']].min(axis=1)
outlier['RULE1_PASSED'] = outlier['T1'] < outlier['MIN_OUT']

# --- 規則 2：T0 > max(INTIME, ADMITTIME) ---
outlier['MAX_IN'] = outlier[['INTIME', 'ADMITTIME']].max(axis=1)
outlier['RULE2_PASSED'] = outlier['T0'] > outlier['MAX_IN']

# 篩出任一規則不通過的 row
violated = outlier[(~outlier['RULE1_PASSED']) | (~outlier['RULE2_PASSED'])]

# 取得違反規則的 subject_id（避免重複）
violated_subject_ids = violated['SUBJECT_ID'].unique()

print("以下 SUBJECT_ID 未通過時間規則檢查：")
print(len(violated_subject_ids))






{17667, 94853, 84874, 52746, 42510, 44437, 54935, 19866, 82843, 29730, 45608, 78892, 91181, 85551, 5685, 47677, 86209, 94785, 83782, 26055, 92235, 12365, 82000, 62035, 77524, 88921, 30170, 5727, 60641, 67429, 80106, 53102, 41976, 40569, 63486}
35
1117
以下 SUBJECT_ID 未通過時間規則檢查：
34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlier[col] = pd.to_datetime(outlier[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlier[col] = pd.to_datetime(outlier[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlier[col] = pd.to_datetime(outlier[col], errors='coerce')
A value is trying to b

In [25]:
import pandas as pd
alive = pd.read_csv("./experiment_data_from_yuran/alive_42731_withHRV.csv")
alive_set = set(alive['SUBJECT_ID'].to_list())
surv_stage2 = pd.read_csv(".\logs\surv_stage2_detailed.csv")

lead2Time = surv_stage2[surv_stage2['SUBJECT_ID'].isin(alive_set)]
lead2Time_fail = lead2Time.loc[lead2Time['AGE_UNDER_125'] & ~lead2Time['ENOUGH_LEAD2']]
print(lead2Time_fail['SUBJECT_ID'].value_counts())

Series([], Name: count, dtype: int64)


In [26]:
base = pd.read_csv(r".\logs\base.csv")
dead = pd.read_csv(r"./experiment_data_from_yuran/dead_42731_withHRV.csv")
dead_set = set(dead['SUBJECT_ID'].to_list())

base_set = set(base['SUBJECT_ID'].to_list())
print(f"total : {len(alive_set-base_set)} :\nDetail: {alive_set-base_set}")
print(f"total : {len(dead_set-base_set)} :\nDetail: {dead_set-base_set}")

total : 0 :
Detail: set()
total : 0 :
Detail: set()


In [None]:
from typing import Optional, Tuple
def parse_base_datetime(hdr):
    """
    處理 Header base time data type (適用於不同 wfdb 版本)
    Args:
    - hdr: wfdb.header 物件

    Return:
    - t0: 紀錄的起始時間, datetime 物件
    """
    if getattr(hdr, "base_datetime", None):           # v4.x 直接提供
        return hdr.base_datetime
    # base_date 可能已是 datetime.date
    bdate = hdr.base_date if isinstance(hdr.base_date, date) \
            else datetime.strptime(hdr.base_date, "%d/%m/%Y").date()
    # base_time 可能已是 datetime.time
    btime = hdr.base_time if isinstance(hdr.base_time, time) \
            else datetime.strptime(hdr.base_time.split(".")[0], "%H:%M:%S").time()
    return datetime.combine(bdate, btime)

def last_lead2_segment_end(hdr, rec_dir: str) -> Optional[Tuple[datetime, float]]:
    """
    計算每一個 含有 Lead II segment 的 ECG 時間長度(in sec)，並以最後一個含 Lead II 的 segment 的結束作為整段ECG訊號的結束(即 T1)

    Args:
    - hdr: wfdb.rdheader() 回傳的 MultiRecordHeader 物件
    - rec_dir: 該 header 檔案所在資料夾 (Path，不含 .hea)

    Return
    - (T1, total_lead2_sec): 
        T1 為最後一個 Lead II segment 的結束 datetime
        total_lead2_sec 為所有 segment 中包含 Lead II 的總秒數
      若完全沒有 Lead II，回傳 None。
    """
    base_dt = parse_base_datetime(hdr)
    if base_dt is None:
        return None

    # 1) 取段名與長度（nsamp） —— 來源 hdr.seg_name / hdr.seg_len
    seg_names = hdr.seg_name           # list[str]
    seg_lens  = hdr.seg_len            # list[int]  :contentReference[oaicite:3]{index=3}
    assert len(seg_names) == len(seg_lens)

    # 2) 預先算 cumulative start index
    cum = [0]
    for L in seg_lens[:-1]:
        cum.append(cum[-1] + L)

    total_sec = 0.0
    last_end  = None

    # 3) 倒序掃描段
    for seg_name, start_idx, seg_len in reversed(list(zip(seg_names, cum, seg_lens))):
        if seg_name == "~":            # gap 段 :contentReference[oaicite:4]{index=4}
            continue
        sub_hdr = wfdb.rdheader(Path(rec_dir, seg_name))
        if "II" not in [n.upper().replace(" ", "") for n in sub_hdr.sig_name]:
            continue

        print(f"file : {seg_name} : sigal length : {seg_len} , fs: {sub_hdr.fs}")

        seg_sec = seg_len / sub_hdr.fs
        total_sec += seg_sec
        
        if last_end is None: #找到最後一段，先行紀錄
            seg_start_abs = base_dt + timedelta(seconds=start_idx / hdr.fs)
            last_end = seg_start_abs + timedelta(seconds=seg_sec)


    if last_end is None:
        return None
    return last_end, total_sec

def header_time_range(header_path:str)->Optional[Tuple[datetime,datetime,float]]:
    """
    計算 ECG訊號的起始時間，關於時間算法的界定如下:
    - T0: ECG 訊號起始時間，這裡採用 `header.base_dt`: 整個 multi-segment record 的起始時間，而不是某個 segment 的起始時間
        - 更精確做法應該採用 seg 內段落(待考慮，予以保留)
    - T1: ECG 訊號結束時間
        - 採用最後一筆 LEAD II seg 作為ECG訊號截止時間
        - 計算方式: 
            - seg_end = seg_start + (seg_length_in_seconds)
            - seg_length_in_seconds = sub_hdr.sig_len / sub_hdr.fs
    Arg:
    - header_path: WFDB header 的完整path : Z:/p00/p000085/p000085-2167-07-25-21-11.hea
    """
    rec_path   = Path(header_path).with_suffix("")    # 去掉 .hea
    hdr        = wfdb.rdheader(str(rec_path), rd_segments=True)
    if hdr.base_time is None or hdr.base_date is None:
        print(f"{rec_path.name}: missing base_date/time")
        return None

    t0 = parse_base_datetime(hdr)
    res = last_lead2_segment_end(hdr, rec_path.parent)
    if res is None:
        print(f"{rec_path.name}: 不存在 Lead II")
        return None

    t1, l2_sec = res
    return t0, t1, l2_sec

print(header_time_range("z:\p09\p095396\p095396-2142-11-06-12-12.hea"))