# 特徵工程 (Feature Engineering)
## 第三期大腸癌存活預測研究

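本筆記本載入前處理後的訓練集與測試集，以 `SurvivalFeatureEngineer` 進行特徵創建，檢視並視覺化新增特徵，再以互信息 (mutual information) 進行特徵選擇，最後將特徵工程後的資料儲存為 `train_features.csv` 與 `test_features.csv`。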

In [None]:
# 導入套件
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Set up paths: the project root is taken to be the parent of the current
# working directory, and is added to sys.path so the `src` package can be
# imported.
project_root = Path.cwd().parent
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# 導入自訂模組
from src.feature_engineering import SurvivalFeatureEngineer
from src.utils import load_config

print("套件載入完成")

In [None]:
# Load the project configuration from <project_root>/config/config.yaml.
config_path = project_root / 'config' / 'config.yaml'
config = load_config(str(config_path))

# Load the processed train/test splits from <project_root>/data/processed/.
processed_dir = project_root / 'data' / 'processed'
train_df = pd.read_csv(processed_dir / 'train_data.csv')
test_df = pd.read_csv(processed_dir / 'test_data.csv')

# Report the (rows, columns) shape of each split as a quick sanity check.
print(f"訓練集形狀: {train_df.shape}")
print(f"測試集形狀: {test_df.shape}")

In [None]:
# Create the feature engineer with its default constructor arguments.
engineer = SurvivalFeatureEngineer()

# Run the complete feature pipeline on a copy of each split (train first,
# then test) so that train_df / test_df keep their original columns for the
# comparison done later in the notebook.
# NOTE(review): each split is transformed by an independent call; if
# apply_all_features derives any statistics from its input, confirm they are
# meant to be computed per split rather than learned from the training set.
train_engineered, test_engineered = (
    engineer.apply_all_features(split.copy()) for split in (train_df, test_df)
)

# Report the shapes after feature engineering.
print(f"\n特徵工程後訓練集形狀: {train_engineered.shape}")
print(f"特徵工程後測試集形狀: {test_engineered.shape}")

In [None]:
# Inspect the new features: every column present in the engineered training
# frame but absent from the original one, kept in engineered-column order.
added_mask = ~train_engineered.columns.isin(train_df.columns)
new_features = train_engineered.columns[added_mask].tolist()
print(f"新增特徵: {new_features}")

# Only print summary statistics when at least one feature was added.
if new_features:
    print("\n新特徵統計:")
    print(train_engineered[new_features].describe())

In [None]:
# Visualise the distribution of each new feature (training set only) as a
# 30-bin histogram, one subplot per feature.
if len(new_features) > 0:
    # Wrap the subplots onto a grid instead of a single row, so the figure
    # width stays bounded however many features were added. With up to
    # `max_cols` features this is the same one-row layout as before.
    max_cols = 4
    n_features = len(new_features)
    n_cols = min(n_features, max_cols)
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so the single-feature
    # case needs no special handling; flatten it for simple iteration.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.ravel()

    for ax, feature in zip(axes, new_features):
        train_engineered[feature].hist(bins=30, ax=ax, edgecolor='black')
        ax.set_title(f'{feature} 分佈')
        ax.set_xlabel(feature)
        ax.set_ylabel('頻率')

    # Hide the unused cells in the last row of the grid.
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Feature selection; runs only when the target column 'event' is present.
if 'event' in train_engineered.columns:
    # Everything except the two survival-outcome columns is a candidate feature.
    target_cols = ['survival_time', 'event']
    is_target = train_engineered.columns.isin(target_cols)
    feature_cols = train_engineered.columns[~is_target].tolist()

    # NOTE(review): all remaining columns are passed through as-is; confirm
    # that select_features can handle every dtype present in them.
    X_train = train_engineered.loc[:, feature_cols]
    y_train = train_engineered['event']

    # Keep the top K features: at most 10, fewer if fewer candidates exist.
    k = min(10, len(feature_cols))
    X_selected = engineer.select_features(
        X_train,
        y_train,
        k=k,
        method='mutual_info',
    )

    # NOTE(review): X_selected is only reported here and is not written to
    # disk by this notebook - confirm that is intended.
    print(f"\n選擇後的特徵數量: {X_selected.shape[1]}")

In [None]:
# Save the feature-engineered splits next to the processed inputs, under
# <project_root>/data/processed/.
features_dir = project_root / 'data' / 'processed'
output_train_path = features_dir / 'train_features.csv'
output_test_path = features_dir / 'test_features.csv'

# Write train first, then test; the DataFrame index is not written.
for frame, destination in (
    (train_engineered, output_train_path),
    (test_engineered, output_test_path),
):
    frame.to_csv(destination, index=False)

print(f"\n訓練特徵已儲存至: {output_train_path}")
print(f"測試特徵已儲存至: {output_test_path}")
print("\n特徵工程完成！")