In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_parquet('/Users/huangshifeng/Desktop/stage_III_colon_surv/data/train.parquet', engine='pyarrow')
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 253 entries, 0 to 300
Data columns (total 43 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Patient_ID            253 non-null    int64         
 1   Dx_Year               253 non-null    int64         
 2   Age                   253 non-null    int64         
 3   Sex                   253 non-null    int64         
 4   BMI                   249 non-null    float64       
 5   ECOG                  239 non-null    Int8          
 6   Tumor_Location        253 non-null    int64         
 7   Tumor_Location_Group  253 non-null    int64         
 8   pT_Stage              253 non-null    category      
 9   pN_Stage              253 non-null    category      
 10  AJCC_Substage         253 non-null    category      
 11  LN_Total              253 non-null    int64         
 12  LN_Positive           253 non-null    int64         
 13  LNR                   253

Patient_ID                0
Dx_Year                   0
Age                       0
Sex                       0
BMI                       4
ECOG                     14
Tumor_Location            0
Tumor_Location_Group      0
pT_Stage                  0
pN_Stage                  0
AJCC_Substage             0
LN_Total                  0
LN_Positive               0
LNR                       0
Histology                 0
Differentiation           1
LVI                       4
PNI                       3
Tumor_Deposits            0
Mucinous_Gt_50            0
Mucinous_Any              0
Signet_Ring               0
MSI_Status                2
Tumor_Size_cm             2
CEA_PreOp                43
Log_CEA_PreOp            43
Radical_Op_Date           0
Op_Procedure              0
PreOp_Albumin            39
Last_FU_Date              0
Recurrence                0
Recurrence_Date         179
Recurrence_Type         179
Death                     0
Death_Cause               0
DFS_Months          

In [3]:
#把沒有用的欄位都去掉
df_core = df.drop(columns=['Patient_ID', 'Dx_Year', 'Tumor_Location', 'LN_Total', 'LN_Positive', 'Histology', 'Signet_Ring', 'Radical_Op_Date', 'Op_Procedure', 'Last_FU_Date', 'Recurrence', 'Recurrence_Date', 'Recurrence_Type', 'Death_Cause', 'Visiting_Staff', 'time_to_recurrence', 'Follow_Up_Time']) 
df_core.head()
df_core.isna().sum()

Age                      0
Sex                      0
BMI                      4
ECOG                    14
Tumor_Location_Group     0
pT_Stage                 0
pN_Stage                 0
AJCC_Substage            0
LNR                      0
Differentiation          1
LVI                      4
PNI                      3
Tumor_Deposits           0
Mucinous_Gt_50           0
Mucinous_Any             0
MSI_Status               2
Tumor_Size_cm            2
CEA_PreOp               43
Log_CEA_PreOp           43
PreOp_Albumin           39
Death                    0
DFS_Months               0
OS_Months                0
Op_Year                  0
edr_18m                  0
edr_24m                  0
dtype: int64

In [4]:
#處理特徵變項欄位coding
df_core['Tumor_Location'] = df_core['Tumor_Location_Group'].rename('Tumor_Location')
ordinal_cols = ['ECOG', 'pT_Stage', 'pN_Stage', 'AJCC_Substage', 'Differentiation']
nominal_cols = []
binary_cols= ['Sex', 'LVI', 'PNI', 'Tumor_Deposits', 'Mucinous_Gt_50', 'Mucinous_Any', 'Tumor_Location']
continuous_cols = ['Age','LNR','CEA_PreOp','Log_CEA_PreOp','BMI','Tumor_Size_cm','PreOp_Albumin']

#MSI_Status要處理
df_core["MSI_High"] = (
    df_core["MSI_Status"]
      .astype("string").str.strip()           # 解開 category，處理空白
      .map({"MSI-H": True, "MSS": False})     # 其餘值 → <NA>
      .astype("boolean")                      # 可空布林：True/False/<NA>
      .astype("Int8")                         # 變 1/0/<NA>
)
df_core['MSI_High'].value_counts()
binary_cols = binary_cols + ["MSI_High"]

In [5]:
print("\n【MISSING VALUES CHECK 缺失值檢查】")

missing = df_core.isnull().sum()
missing_pct = (missing / len(df_core) * 100).round(2)
missing_df = pd.DataFrame({
    'Missing_Count': missing,
    'Missing_Pct': missing_pct
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Pct', ascending=False)

if len(missing_df) > 0:
    print(f"⚠️ Found {len(missing_df)} columns with missing values:")
    print(missing_df.head(20).to_string())
    
    # 高缺失率變數
    high_missing = missing_df[missing_df['Missing_Pct'] > 20]
    if len(high_missing) > 0:
        print(f"\n⚠️ HIGH MISSING (>20%):")
        print(high_missing.to_string())
else:
    print("✓ No missing values found!")


【MISSING VALUES CHECK 缺失值檢查】
⚠️ Found 11 columns with missing values:
                 Missing_Count  Missing_Pct
CEA_PreOp                   43        17.00
Log_CEA_PreOp               43        17.00
PreOp_Albumin               39        15.42
ECOG                        14         5.53
BMI                          4         1.58
LVI                          4         1.58
PNI                          3         1.19
MSI_Status                   2         0.79
Tumor_Size_cm                2         0.79
MSI_High                     2         0.79
Differentiation              1         0.40


In [6]:
#對ordinal_cols編碼
def encode_ordinal_features(df):
    """
    對ordinal變數做編碼
    """
    # 1. ECOG
    if 'ECOG' in df.columns:
        # 應該已經是0,1,2,3,4的數字
        df['ECOG_ordinal'] = df['ECOG'].astype(float)
    
    # 2. pT_Stage
    if 'pT_Stage' in df.columns:
        pt_map = {
            '1': 1, '1A': 1, '1B': 1,
            '2': 2,
            '3': 3,
            '4': 4, '4A': 4, '4B': 4
        }
        df['pT_ordinal'] = df['pT_Stage'].map(pt_map)
    
    # 3. pN_Stage
    if 'pN_Stage' in df.columns:
        pn_map = {
            '0': 0,
            '1': 1, '1A': 1, '1B': 1, '1C': 1,
            '2': 2, '2A': 2, '2B': 2
        }
        df['pN_ordinal'] = df['pN_Stage'].map(pn_map)
    
    # 4. AJCC_Substage
    if 'AJCC_Substage' in df.columns:
        ajcc_map = {
            '3A': 1,
            '3B': 2,
            '3C': 3
        }
        df['AJCC_ordinal'] = df['AJCC_Substage'].map(ajcc_map)
    
    # 5. Differentiation
    if 'Differentiation' in df.columns:
        # 已經是1,2,3，直接用
        df['Differentiation_ordinal'] = df['Differentiation']
    
    return df

df_core = encode_ordinal_features(df_core)
df_core.dtypes

Age                           int64
Sex                           int64
BMI                         float64
ECOG                           Int8
Tumor_Location_Group          int64
pT_Stage                   category
pN_Stage                   category
AJCC_Substage              category
LNR                         float64
Differentiation                Int8
LVI                         float64
PNI                         float64
Tumor_Deposits                int64
Mucinous_Gt_50                int64
Mucinous_Any                  int64
MSI_Status                 category
Tumor_Size_cm               float64
CEA_PreOp                   float64
Log_CEA_PreOp               float64
PreOp_Albumin               float64
Death                         int64
DFS_Months                  float64
OS_Months                   float64
Op_Year                       int32
edr_18m                        Int8
edr_24m                        Int8
Tumor_Location                int64
MSI_High                    

In [7]:
#處理Binary cols
def check_and_fix_binary(df, binary_cols):
    """
    確保binary變數是0/1
    """    
    for col in binary_cols:
        if col not in df.columns:
            continue
            
        # 檢查unique values
        unique_vals = df[col].dropna().unique()
        print(f"{col}: {unique_vals}")

        sex_map = {1:1, 2:0, "1":1, "2":0}
        location_map = {1:1, 2:0, "1":1, "2":0}        
        if col == "Sex":
            df[col]=df[col].map(sex_map)
        elif col == "Tumor_Location":
            df[col]=df[col].map(location_map)
        else:
            continue
        
        # 驗證只有0和1
        unique_after = df[col].unique()
        if not set(unique_after).issubset({0, 1}):
            print(f"  ⚠️ Warning: {col} has values other than 0/1: {unique_after}")
    
    return df

df_core = check_and_fix_binary(df_core, binary_cols)

df_core[binary_cols].dtypes

Sex: [2 1]
LVI: [1. 0.]
PNI: [1. 0.]
Tumor_Deposits: [0 1]
Mucinous_Gt_50: [1 0]
Mucinous_Any: [1 0]
Tumor_Location: [1 2]
MSI_High: <IntegerArray>
[0, 1]
Length: 2, dtype: Int8


Sex                 int64
LVI               float64
PNI               float64
Tumor_Deposits      int64
Mucinous_Gt_50      int64
Mucinous_Any        int64
Tumor_Location      int64
MSI_High             Int8
dtype: object

In [8]:
def final_check(df, feature_list):
    """
    最終檢查所有特徵是否ready for ML
    """
    print("=== FINAL ML-READINESS CHECK ===\n")
    
    for feat in feature_list:
        if feat not in df.columns:
            print(f"✗ {feat}: NOT FOUND")
            continue
        
        dtype = df[feat].dtype
        nunique = df[feat].nunique()
        missing = df[feat].isna().sum()
        missing_pct = missing / len(df) * 100
        
        print(f"✓ {feat}")
        print(f"    dtype: {dtype}")
        print(f"    unique: {nunique}")
        print(f"    missing: {missing} ({missing_pct:.1f}%)")
        
        # 檢查值的範圍
        if dtype in ['int64', 'float64', 'Int64']:
            values = df[feat].dropna()
            if len(values) > 0:
                print(f"    range: [{values.min():.2f}, {values.max():.2f}]")
        else:
            print(f"    sample values: {df[feat].unique()[:5]}")
        
        print()

# 使用
numeric_features = continuous_cols
categorical_features =  ['ECOG_ordinal', 'pT_ordinal', 'pN_ordinal', 'AJCC_ordinal', 'Differentiation_ordinal'] + binary_cols
feature_list =  numeric_features + categorical_features
final_check(df_core, feature_list)

=== FINAL ML-READINESS CHECK ===

✓ Age
    dtype: int64
    unique: 56
    missing: 0 (0.0%)
    range: [32.00, 98.00]

✓ LNR
    dtype: float64
    unique: 116
    missing: 0 (0.0%)
    range: [0.00, 0.92]

✓ CEA_PreOp
    dtype: float64
    unique: 101
    missing: 43 (17.0%)
    range: [0.50, 914.20]

✓ Log_CEA_PreOp
    dtype: float64
    unique: 97
    missing: 43 (17.0%)
    range: [0.41, 6.82]

✓ BMI
    dtype: float64
    unique: 131
    missing: 4 (1.6%)
    range: [13.80, 39.00]

✓ Tumor_Size_cm
    dtype: float64
    unique: 72
    missing: 2 (0.8%)
    range: [0.10, 12.00]

✓ PreOp_Albumin
    dtype: float64
    unique: 29
    missing: 39 (15.4%)
    range: [2.10, 4.90]

✓ ECOG_ordinal
    dtype: float64
    unique: 4
    missing: 14 (5.5%)
    range: [0.00, 3.00]

✓ pT_ordinal
    dtype: int64
    unique: 4
    missing: 0 (0.0%)
    range: [1.00, 4.00]

✓ pN_ordinal
    dtype: int64
    unique: 2
    missing: 0 (0.0%)
    range: [1.00, 2.00]

✓ AJCC_ordinal
    dtype: cat

In [9]:
#修正驗證發現的問題

#修正BMI的0值
df_core.loc[df_core['BMI'] == 0, 'BMI'] = np.nan

#修正AJCC的dtype
df_core['AJCC_ordinal'] = df_core['AJCC_ordinal'].astype(float)

#再檢查一次
final_check(df_core, feature_list)

=== FINAL ML-READINESS CHECK ===

✓ Age
    dtype: int64
    unique: 56
    missing: 0 (0.0%)
    range: [32.00, 98.00]

✓ LNR
    dtype: float64
    unique: 116
    missing: 0 (0.0%)
    range: [0.00, 0.92]

✓ CEA_PreOp
    dtype: float64
    unique: 101
    missing: 43 (17.0%)
    range: [0.50, 914.20]

✓ Log_CEA_PreOp
    dtype: float64
    unique: 97
    missing: 43 (17.0%)
    range: [0.41, 6.82]

✓ BMI
    dtype: float64
    unique: 131
    missing: 4 (1.6%)
    range: [13.80, 39.00]

✓ Tumor_Size_cm
    dtype: float64
    unique: 72
    missing: 2 (0.8%)
    range: [0.10, 12.00]

✓ PreOp_Albumin
    dtype: float64
    unique: 29
    missing: 39 (15.4%)
    range: [2.10, 4.90]

✓ ECOG_ordinal
    dtype: float64
    unique: 4
    missing: 14 (5.5%)
    range: [0.00, 3.00]

✓ pT_ordinal
    dtype: int64
    unique: 4
    missing: 0 (0.0%)
    range: [1.00, 4.00]

✓ pN_ordinal
    dtype: int64
    unique: 2
    missing: 0 (0.0%)
    range: [1.00, 2.00]

✓ AJCC_ordinal
    dtype: flo

In [10]:
df_core.to_parquet('/Users/huangshifeng/Desktop/stage_III_colon_surv/data/train_preprocessed.parquet')