In [None]:
import os
import csv
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:

# Root folder holding one sub-folder per dataset. Set the COMPETITION_DATA_DIR
# environment variable to run on another machine; the original local path is
# kept as the fallback so existing setups keep working.
base_path = Path(
    os.environ.get(
        "COMPETITION_DATA_DIR",
        r"C:/Users/athen/Downloads/Competition_data/Competition_data",
    )
)

# Parallel lists: index i in each list refers to the same dataset.
dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir() returns entries in arbitrary order and also returns plain files,
# so keep directories only and sort them to make the dataset order reproducible.
for folder_name in sorted(os.listdir(base_path)):
    dataset_dir = base_path / folder_name
    if not dataset_dir.is_dir():
        continue
    dataset_names.append(folder_name)
    # A dataset folder missing one of these files still raises, as before.
    X_trains.append(pd.read_csv(dataset_dir / "X_train.csv", header=0))
    y_trains.append(pd.read_csv(dataset_dir / "y_train.csv", header=0))
    X_tests.append(pd.read_csv(dataset_dir / "X_test.csv", header=0))
## your code here
def check_data_quality(df, dataset_name):
    """Print a data-quality report for ``df``.

    The report contains, in order: per-column missing-value counts (only when
    at least one value is missing), the number of duplicated rows (only when
    non-zero), ``df.describe()``, and for every int64/float64 column the number
    of rows outside the 1.5 * IQR fences (only when non-zero).

    Args:
        df: DataFrame to inspect. It is not modified.
        dataset_name: Label printed in the report header.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"\n{dataset_name} 數據質量報告:")

    # Missing values
    null_counts = df.isnull().sum()
    if null_counts.any():
        print("缺失值統計:")
        print(null_counts[null_counts > 0])

    # Duplicated rows
    n_duplicated = df.duplicated().sum()
    if n_duplicated > 0:
        print(f"重複行數量: {n_duplicated}")

    # Summary statistics
    print("\n數值型特徵基本統計:")
    print(df.describe())

    # Outliers by the IQR rule
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        values = df[col]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        fence = 1.5 * (upper_quartile - lower_quartile)
        is_outlier = (values < lower_quartile - fence) | (values > upper_quartile + fence)
        n_outliers = int(is_outlier.sum())
        if n_outliers > 0:
            print(f"\n{col} 列可能的異常值數量: {n_outliers}")
def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed.

    int64/float64 columns are filled with the column median and object columns
    with the column's first mode. An object column that has no mode (every
    value missing) is left unchanged instead of raising.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    filled = df.copy()

    # Numeric features: median. Assign the result back to the column;
    # ``df[col].fillna(..., inplace=True)`` acts on a temporary object and does
    # not update the frame under pandas Copy-on-Write.
    numeric_cols = filled.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        filled[col] = filled[col].fillna(filled[col].median())

    # Categorical features: most frequent value.
    categorical_cols = filled.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        modes = filled[col].mode()
        if modes.empty:
            # mode() ignores NaN, so an all-missing column has nothing to fill with.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def scale_features(X_train, X_test):
    """Standardise the numeric columns of a train/test pair.

    A ``StandardScaler`` is fitted on the int64/float64 columns of ``X_train``
    only and then applied to the same columns of ``X_test``. Other columns are
    passed through untouched. MinMaxScaler or RobustScaler can be swapped in
    below if a different scaling is wanted.

    Args:
        X_train: Training features. It is not modified.
        X_test: Test features with the same numeric columns. It is not modified.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)``. The two frames are
        copies of the inputs. ``scaler`` is returned unfitted when ``X_train``
        has no int64/float64 columns.
    """
    scaler = StandardScaler()

    # Work on copies so re-running a cell never rescales already-scaled data.
    train_scaled = X_train.copy()
    test_scaled = X_test.copy()

    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    if len(numeric_cols) == 0:
        # Nothing to scale; fitting on a zero-column frame would raise.
        return train_scaled, test_scaled, scaler

    train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    return train_scaled, test_scaled, scaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def encode_features(X_train, X_test):
    """Label-encode the object columns of a train/test pair.

    For every object column of ``X_train``, missing values in both frames are
    replaced with the string ``'Unknown'`` and a ``LabelEncoder`` is fitted on
    the union of the train and test values, so transforming the test column
    never meets a label the encoder has not seen.

    Args:
        X_train: Training features. It is not modified.
        X_test: Test features; must contain every object column of ``X_train``.
            It is not modified.

    Returns:
        Tuple ``(X_train_encoded, X_test_encoded, le_encoders)`` where
        ``le_encoders`` maps column name to its fitted encoder.
    """
    # Copies keep the caller's frames intact and make the function safe to re-run.
    X_train = X_train.copy()
    X_test = X_test.copy()

    le_encoders = {}
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        # Assign the filled column back: ``df[col].fillna(..., inplace=True)``
        # does not update the frame under pandas Copy-on-Write, which would
        # leave NaN mixed with strings when the encoder is fitted.
        X_train[col] = X_train[col].fillna('Unknown')
        X_test[col] = X_test[col].fillna('Unknown')

        # Fit on the values of both splits.
        le = LabelEncoder()
        le.fit(pd.concat([X_train[col], X_test[col]]).unique())

        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        le_encoders[col] = le

    return X_train, X_test, le_encoders
def create_features(df):
    """Return ``df`` extended with derived numeric features.

    For the int64/float64 columns present in ``df`` on entry, the following
    columns are appended, in this order:

    * ``{a}_plus_{b}`` for every unordered pair of columns,
    * ``{a}_mult_{b}`` for every unordered pair of columns,
    * ``{c}_squared``, ``{c}_cubed`` and ``{c}_log`` for every column, where
      the log feature is ``log1p(c - min(c) + 1)``.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame holding the original columns followed by the derived
        ones. A derived column whose name already exists in ``df`` replaces the
        existing column.
    """
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    pairs = [
        (numeric_cols[i], numeric_cols[j])
        for i in range(len(numeric_cols))
        for j in range(i + 1, len(numeric_cols))
    ]

    # Collect every new column first and attach them in one concat. Inserting
    # them one by one fragments the frame and is slow, because the number of
    # pairwise features grows quadratically with the number of columns.
    derived = {}

    # Additive interactions
    for col1, col2 in pairs:
        derived[f'{col1}_plus_{col2}'] = df[col1] + df[col2]

    # Multiplicative interactions
    for col1, col2 in pairs:
        derived[f'{col1}_mult_{col2}'] = df[col1] * df[col2]

    # Per-column transforms
    for col in numeric_cols:
        derived[f'{col}_squared'] = df[col] ** 2
        derived[f'{col}_cubed'] = df[col] ** 3
        # Shifting by the minimum keeps the log argument positive.
        derived[f'{col}_log'] = np.log1p(df[col] - df[col].min() + 1)

    if not derived:
        # pd.concat rejects an empty collection.
        return df.copy()

    # Drop clashing names so the result never carries duplicate column labels.
    base = df.drop(columns=[name for name in derived if name in df.columns])
    return pd.concat([base, pd.concat(derived, axis=1)], axis=1)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

def select_features(X_train, y_train, X_test, method='statistical', n_features=10):
    """Select features with a univariate test or random-forest importances.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training labels; flattened to one dimension before use.
        X_test: Test feature DataFrame with the same columns as ``X_train``.
        method: ``'statistical'`` uses ``SelectKBest`` with ``f_classif``;
            ``'tree_based'`` ranks features by the importances of a
            100-tree ``RandomForestClassifier`` (random_state=42).
        n_features: Number of features to keep. An integer larger than the
            number of columns is reduced to the number of columns.

    Returns:
        Tuple ``(X_train_selected, X_test_selected, selected_features)``.
        For ``'statistical'`` the first two items are whatever the selector's
        ``fit_transform`` / ``transform`` return; for ``'tree_based'`` they are
        column subsets of the input frames. ``selected_features`` is a list of
        column names.

    Raises:
        ValueError: If ``method`` is not one of the two supported values.
    """
    if method not in ('statistical', 'tree_based'):
        # Fail with a clear message instead of an unbound-variable error at the return.
        raise ValueError(
            f"Unknown method {method!r}; expected 'statistical' or 'tree_based'"
        )

    # Labels loaded from CSV arrive as a one-column frame; the estimators want 1-D.
    target = np.ravel(y_train)

    if isinstance(n_features, int):
        # Asking for more features than exist makes SelectKBest raise.
        n_features = min(n_features, X_train.shape[1])

    if method == 'statistical':
        selector = SelectKBest(score_func=f_classif, k=n_features)
        X_train_selected = selector.fit_transform(X_train, target)
        X_test_selected = selector.transform(X_test)

        # Names of the columns the selector kept
        selected_features = X_train.columns[selector.get_support()].tolist()
    else:
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train, target)

        importances = pd.DataFrame({
            'feature': X_train.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Keep the n most important features
        selected_features = importances['feature'].head(n_features).tolist()
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]

    return X_train_selected, X_test_selected, selected_features

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
# This import must precede the constructor call below; with the import placed
# after it, a fresh-kernel run raises NameError.
from sklearn.linear_model import LogisticRegression

# Reference configuration with the knobs worth tuning. The training loop that
# follows builds its own estimator per dataset and rebinds `model`.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    C=1.0,  # tune this: smaller values mean stronger regularisation
    penalty='l2',  # 'l1' can be used for feature selection (check solver support)
    solver='lbfgs',  # other optimisers can be tried here
    class_weight='balanced'
)
# Train one logistic-regression model per dataset and score it on a hold-out split
print("\n=== 模型訓練和驗證集評估 ===")
models = []
validation_aucs = []  # validation AUC per model, aligned with dataset_names

for i in range(len(dataset_names)):
    # Hold out 20% of the training data for validation
    tmp_X_train, tmp_X_val, tmp_y_train, tmp_y_val = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )

    model = LogisticRegression(
        random_state=42,
        max_iter=1000,  # extra iterations to help convergence
        C=1.0,         # inverse of the regularisation strength
        class_weight='balanced'  # compensate for class imbalance
    )

    # np.ravel yields a 1-D label array for any number of rows, whereas
    # squeeze() collapses a one-row frame to a scalar.
    # Assumes the label file has a single column -- TODO confirm.
    model.fit(tmp_X_train, np.ravel(tmp_y_train))

    # Score the hold-out split with the probability in column 1
    val_proba = model.predict_proba(tmp_X_val)[:, 1]
    val_auc = roc_auc_score(np.ravel(tmp_y_val), val_proba)
    validation_aucs.append(val_auc)

    print(f"\n{dataset_names[i]} 模型評估:")
    print(f"- 驗證集 AUC: {val_auc:.4f}")

    # Rank features by absolute coefficient size
    feature_importance = pd.DataFrame({
        'Feature': X_trains[i].columns,
        'Importance': abs(model.coef_[0])
    }).sort_values('Importance', ascending=False)

    print("\n前5個最重要特徵:")
    print(feature_importance.head())

    models.append(model)

# Overall summary across datasets
print("\n=== 整體模型表現 ===")
if validation_aucs:
    best_idx = int(np.argmax(validation_aucs))
    worst_idx = int(np.argmin(validation_aucs))
    print(f"平均驗證集 AUC: {np.mean(validation_aucs):.4f}")
    # Report the real folder name; "Dataset_<position>" is only right when the
    # folders happen to be named and ordered that way.
    print(f"最佳驗證集 AUC: {validation_aucs[best_idx]:.4f} ({dataset_names[best_idx]})")
    print(f"最差驗證集 AUC: {validation_aucs[worst_idx]:.4f} ({dataset_names[worst_idx]})")
else:
    # np.max / np.argmax raise on an empty list
    print("未找到任何數據集，無法計算 AUC。")

# Predict on every test set, write the scores to disk and rank the models
print("\n=== 預測和儲存進度 ===")
y_predicts = []
for i in range(len(dataset_names)):
    # Column 1 of predict_proba is the score that gets saved
    y_predict_proba = models[i].predict_proba(X_tests[i])[:, 1]
    df = pd.DataFrame({'y_predict_proba': y_predict_proba})
    y_predicts.append(df)

    # Write y_predict.csv into the dataset's own folder
    output_path = Path(base_path, dataset_names[i], "y_predict.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False, header=True)

    print(
        f"\n儲存 {dataset_names[i]} 的預測結果:",
        f"- 驗證集 AUC: {validation_aucs[i]:.4f}",
        f"- 預測數量: {len(df)} 筆",
        f"- 預測值範圍: {df['y_predict_proba'].min():.3f} 到 {df['y_predict_proba'].max():.3f}",
        f"- 儲存路徑: {output_path}",
        sep="\n",
    )

# Rank the datasets by validation AUC, best first
print("\n=== 模型表現排名 ===")
performance_df = pd.DataFrame(
    {'Dataset': dataset_names, 'Validation_AUC': validation_aucs}
).sort_values('Validation_AUC', ascending=False)
print("\n模型表現排名:")
print(performance_df)
# y_predicts already holds one prediction frame per dataset from the prediction
# loop earlier in this cell, so it is reused here rather than recomputed.

# Print the header once, outside any loop; inside a loop it repeats per dataset.
print("\n=== 數據集列表 ===")
for i, name in enumerate(dataset_names):
    print(f"{i+1}. {name}")

print("\n=== 預測和儲存進度 ===")
for idx, dataset_name in enumerate(dataset_names):
    df = y_predicts[idx]
    output_path = Path(base_path) / dataset_name / "y_predict.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save the predictions
    df.to_csv(output_path, index=False, header=True)

    # Report what was written
    print(f"\n儲存 {dataset_name} 的預測結果:")
    print(f"- 儲存路徑: {output_path}")
    print(f"- 預測數量: {len(df)} 筆")
    print(f"- 預測值範圍: {df['y_predict_proba'].min():.3f} 到 {df['y_predict_proba'].max():.3f}")
    print(f"- 檔案大小: {output_path.stat().st_size / 1024:.2f} KB")

# Final check: every dataset folder must contain a prediction file
print("\n=== 最終檢查 ===")
all_success = True
for dataset_name in dataset_names:
    check_path = Path(base_path) / dataset_name / "y_predict.csv"
    if check_path.exists():
        print(f"✓ {dataset_name} 預測檔案已成功儲存")
    else:
        print(f"✗ {dataset_name} 預測檔案未找到！")
        all_success = False

if all_success:
    print("\n所有數據集的預測結果都已成功儲存！")
else:
    print("\n警告：部分數據集的預測結果可能未正確儲存，請檢查上述訊息。")