# 資料預處理 (Data Preprocessing)
## 第三期大腸癌存活預測研究

本筆記本負責資料預處理，依序進行：缺失值處理、類別變數編碼、數值特徵標準化、訓練集／測試集分割，以及處理後資料的儲存。

In [None]:
# 導入套件
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Set up paths.
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably expects the notebook to be launched from a
# sub-directory directly under the project root (e.g. notebooks/) - confirm.
project_root = Path.cwd().parent
# Register the project root on sys.path only once, so that re-running this
# cell does not keep appending duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import the project's own modules from the `src` package.
# These imports rely on the project root having been appended to sys.path
# earlier in this cell.
from src.data_preprocessing import SurvivalDataPreprocessor
from src.utils import load_config

# Confirmation message: packages loaded.
print("套件載入完成")

In [None]:
# Load the project configuration from <project_root>/config/config.yaml.
# load_config is given the path as a plain string.
config_path = project_root / 'config' / 'config.yaml'
config = load_config(str(config_path))
print("配置載入完成")

In [None]:
# Build the preprocessor from the loaded configuration.
preprocessor = SurvivalDataPreprocessor(config)

# Load the raw dataset; raw_data.csv is read from directly under the
# project root.
raw_data_path = project_root / 'raw_data.csv'
df = preprocessor.load_data(str(raw_data_path))

# Last expression of the cell: show the first rows for a quick visual check.
df.head()

In [None]:
# Handle missing values using the preprocessor's 'median' strategy and
# report the shape of the resulting frame.
df_clean = preprocessor.handle_missing_values(
    df,
    strategy='median',
)
print(f"\n清理後資料形狀: {df_clean.shape}")

In [None]:
# Identify categorical and numeric variables by dtype:
#   - object-dtype columns are treated as categorical,
#   - any NumPy numeric dtype counts as numeric.
object_columns = df_clean.select_dtypes(include=['object']).columns
number_columns = df_clean.select_dtypes(include=[np.number]).columns

categorical_cols = object_columns.tolist()
numeric_cols = number_columns.tolist()

print(f"類別變數: {categorical_cols}")
print(f"數值變數: {numeric_cols}")

In [None]:
# Encode categorical variables.
# With no categorical columns the cleaned frame is simply copied, so later
# steps can always work on df_encoded.
if not categorical_cols:
    df_encoded = df_clean.copy()
    print("沒有類別變數需要編碼")
else:
    df_encoded = preprocessor.encode_categorical_variables(df_clean, categorical_cols)

# Last expression of the cell: preview the encoded frame.
df_encoded.head()

In [None]:
# Normalize numeric features, leaving the target variables untouched.
# numeric_cols was derived from df_clean (before encoding), so any columns
# produced by the encoding step are not part of feature_cols.
target_cols = ['survival_time', 'event']  # adjust to match the actual dataset
feature_cols = [name for name in numeric_cols if name not in target_cols]

# Always start from a copy of the encoded frame; normalize only when there
# is at least one feature column to process.
df_normalized = df_encoded.copy()
if feature_cols:
    df_normalized = preprocessor.normalize_features(df_normalized, feature_cols)
else:
    print("沒有需要標準化的特徵")

# Last expression of the cell: preview the normalized frame.
df_normalized.head()

In [None]:
# Split into training and test sets using the 'split' section of the config.
# NOTE(review): missing-value handling and normalization were applied to the
# full dataset before this split. If those steps fit statistics on the data
# (e.g. medians, mean/std), test-set information leaks into training -
# confirm, and consider fitting on the training split only.
split_cfg = config['split']
train_df, test_df = preprocessor.split_data(
    df_normalized,
    test_size=split_cfg['test_size'],
    random_state=split_cfg['random_state'],
)

print(f"訓練集: {train_df.shape}")
print(f"測試集: {test_df.shape}")

In [None]:
# Save the processed splits under <project_root>/data/processed.
processed_dir = project_root / 'data' / 'processed'

# Make sure the output directory exists before writing. Nothing earlier in
# this notebook creates it, and whether save_processed_data does so cannot
# be seen from here; exist_ok=True makes this a no-op when it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

train_path = processed_dir / 'train_data.csv'
test_path = processed_dir / 'test_data.csv'

# save_processed_data is given the destination paths as plain strings.
preprocessor.save_processed_data(train_df, test_df, str(train_path), str(test_path))

print("\n資料預處理完成！")

In [None]:
# Inspect the processed data: print a heading followed by describe()
# summary statistics, first for the training set and then for the test set.
for heading, frame in (("訓練集統計:", train_df), ("\n測試集統計:", test_df)):
    print(heading)
    print(frame.describe())