In [5]:
import os
import csv
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Discover the per-dataset folders under ./Competition_data and load each
# folder's training features (X_train.csv) and labels (y_train.csv).
X_trains = []
y_trains = []

# os.listdir returns plain files as well as folders; keep only real
# directories so a stray file (archive, README, ...) next to the datasets does
# not end up in the read_csv calls below.
# "Dataset_<n>" folders are ordered by their numeric suffix (Dataset_2 before
# Dataset_10); any other folder sorts after them.
dataset_names = sorted(
    [
        name
        for name in os.listdir("./Competition_data")
        if not name.startswith('.')
        and os.path.isdir(os.path.join("./Competition_data", name))
    ],
    key=lambda x: int(x.split('_')[1]) if x.startswith("Dataset_") else float('inf'))

for folder_name in dataset_names:
    X_train = pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv")
    y_train = pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv")

    # The frames are later stacked independently, so a per-dataset row mismatch
    # would shift labels against features; fail here with a clear message.
    if len(X_train) != len(y_train):
        raise ValueError(
            f"{folder_name}: X_train has {len(X_train)} rows "
            f"but y_train has {len(y_train)} rows")

    X_trains.append(X_train)
    y_trains.append(y_train)

# Collect the union of feature names across every dataset, sorted so the
# column layout is the same on every run.
all_features = sorted(set().union(*(frame.columns for frame in X_trains)))

# Give every dataset the same column layout; a feature a dataset does not
# have becomes an all-NaN column at this point.
reindexed_X_trains = [frame.reindex(columns=all_features) for frame in X_trains]

# Per-feature mean over the rows of all datasets stacked together; used as the
# fill value for missing entries.
feature_means = pd.concat(reindexed_X_trains, axis=0).mean()

# Replace the missing entries of each aligned dataset with those means.
aligned_X_trains = [frame.fillna(feature_means) for frame in reindexed_X_trains]

# Stack all training data; labels are stacked the same way and flattened.
X_train_combined = pd.concat(aligned_X_trains, axis=0)
y_train_combined = pd.concat(y_trains, axis=0).values.ravel()

In [None]:
# Build and fit the random-forest model on the combined training data.
rf_model = RandomForestClassifier(
    n_estimators=3000,
    max_depth=None,
    random_state=42,  # fixed seed so the fit is reproducible
    class_weight='balanced',
    # Fit (and later predict with) the 3000 trees on all available CPU cores
    # instead of a single job; the seed above is unchanged.
    n_jobs=-1)
rf_model.fit(X_train_combined, y_train_combined)

In [81]:
# Score each dataset's test split with the fitted forest and write the
# probabilities into that dataset's folder as y_predict.csv.
for folder_name in dataset_names:
    X_test = pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv")

    # Match the training column layout, then fill gaps with the training means.
    X_test_aligned = X_test.reindex(columns=all_features)
    X_test_aligned = X_test_aligned.fillna(feature_means)

    # Keep column 1 of the probability matrix (second entry of
    # rf_model.classes_) -- presumably the positive class; confirm against the
    # label encoding.
    class_probabilities = rf_model.predict_proba(X_test_aligned)
    y_pred_proba = class_probabilities[:, 1]

    # Persist the probabilities as a single 'y_predict' column, without index.
    predictions = pd.Series(y_pred_proba, name='y_predict').to_frame()
    predictions.to_csv(f'./Competition_data/{folder_name}/y_predict.csv', index=False)

    print(f"Predictions for {folder_name} saved to {folder_name}/y_predict.csv")

Predictions for Dataset_1 saved to Dataset_1/y_predict.csv
Predictions for Dataset_2 saved to Dataset_2/y_predict.csv
Predictions for Dataset_3 saved to Dataset_3/y_predict.csv
Predictions for Dataset_4 saved to Dataset_4/y_predict.csv
Predictions for Dataset_5 saved to Dataset_5/y_predict.csv
Predictions for Dataset_6 saved to Dataset_6/y_predict.csv
Predictions for Dataset_7 saved to Dataset_7/y_predict.csv
Predictions for Dataset_8 saved to Dataset_8/y_predict.csv
Predictions for Dataset_9 saved to Dataset_9/y_predict.csv
Predictions for Dataset_10 saved to Dataset_10/y_predict.csv
Predictions for Dataset_11 saved to Dataset_11/y_predict.csv
Predictions for Dataset_12 saved to Dataset_12/y_predict.csv
Predictions for Dataset_13 saved to Dataset_13/y_predict.csv
Predictions for Dataset_14 saved to Dataset_14/y_predict.csv
Predictions for Dataset_15 saved to Dataset_15/y_predict.csv
Predictions for Dataset_16 saved to Dataset_16/y_predict.csv
Predictions for Dataset_17 saved to Datase

AUC:0.812619