In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [None]:
# 準備訓練和測試數據集名稱並排序
dataset_names = sorted(
    [name for name in os.listdir("./Competition_data") if not name.startswith('.')],
    key=lambda x: int(x.split('_')[1]) if x.startswith("Dataset_") else float('inf')
)

# 加載所有訓練資料
X_trains, y_trains = [], []
for folder_name in dataset_names:
    X_train = pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv")
    y_train = pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv")
    X_trains.append(X_train)
    y_trains.append(y_train)

# 找到所有資料集中的獨特特徵名稱，並排序
all_features = set()
for X_train in X_trains:
    all_features.update(X_train.columns)
all_features = sorted(list(all_features))  # 確保特徵名稱排序一致性

# 計算每個特徵的均值來填充缺失值
feature_means = pd.concat([X_train.reindex(columns=all_features) for X_train in X_trains], axis=0).mean()

# 將每個資料集的特徵對齊，並使用均值填充缺失值
aligned_X_trains = [X_train.reindex(columns=all_features).fillna(feature_means) for X_train in X_trains]

# 合併所有訓練資料
X_train_combined = pd.concat(aligned_X_trains, axis=0)
y_train_combined = pd.concat(y_trains, axis=0).values.ravel()



In [5]:
# 設置基模型
base_models = [
    ('random_forest', RandomForestClassifier(n_estimators=2000, random_state=42)),
    ('xgboost', XGBClassifier(n_estimators=2000, max_depth=None, learning_rate=0.04, colsample_bytree=0.9,colsample_bylevel=0.9,
                        subsample=0.9, random_state=42, scale_pos_weight=len(y_train_combined[y_train_combined == 0]) / len(y_train_combined[y_train_combined == 1]))),
    ('lightgbm', LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    colsample_bytree=0.9,
    subsample=0.9,
    random_state=42,
    is_unbalance=True  # 自動平衡類別，與 scale_pos_weight 作用相同
))
]

# 設置元模型
meta_model = LogisticRegression()

# 訓練基模型並生成折外預測
kf = KFold(n_splits=5, shuffle=True, random_state=42)
base_model_predictions = np.zeros((X_train_combined.shape[0], len(base_models)))

for i, (name, model) in enumerate(base_models):
    oof_preds = np.zeros(X_train_combined.shape[0])
    for train_idx, val_idx in kf.split(X_train_combined):
        model.fit(X_train_combined.iloc[train_idx], y_train_combined[train_idx])
        oof_preds[val_idx] = model.predict_proba(X_train_combined.iloc[val_idx])[:, 1]  # 機率預測
    base_model_predictions[:, i] = oof_preds

# 使用基模型的預測作為元模型的訓練資料
meta_model.fit(base_model_predictions, y_train_combined)
print("Stacking 模型訓練完成")

# 對每個測試集進行預測
for folder_name in dataset_names:
    X_test = pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv")
    # 對齊並使用均值填充
    X_test_aligned = X_test.reindex(columns=all_features).fillna(feature_means)
    
    # 基模型預測
    test_preds = np.zeros((X_test_aligned.shape[0], len(base_models)))
    for i, (name, model) in enumerate(base_models):
        test_preds[:, i] = model.predict_proba(X_test_aligned)[:, 1]  # 機率預測

    # 使用元模型進行最終預測
    y_pred_proba = meta_model.predict_proba(test_preds)[:, 1]
    
    # 儲存機率預測
    predictions = pd.DataFrame({'y_predict': y_pred_proba})
    predictions.to_csv(f'./Competition_data/{folder_name}/y_predict.csv', index=False)
    
    print(f"Predictions for {folder_name} saved to {folder_name}/y_predict.csv")

[LightGBM] [Info] Number of positive: 4032, number of negative: 7756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006937 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10356
[LightGBM] [Info] Number of data points in the train set: 11788, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.342043 -> initscore=-0.654204
[LightGBM] [Info] Start training from score -0.654204
[LightGBM] [Info] Number of positive: 4063, number of negative: 7725
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10358
[LightGBM] [Info] Number of data points in the train set: 11788, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344673 -> initscore=-0.642540
[LightGBM] [Info] Start training from score -0.642540
[LightGBM] [In