<a href="https://colab.research.google.com/github/nanpolend/machine-learning/blob/master/Jane_street_ai%E6%AF%94%E8%B3%BD%E4%BB%A3%E7%A2%BCgpt4o%E7%89%88Gemini%E6%94%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def preprocess_data(data):
    """Fill missing values in numeric columns with each column's mean.

    The frame is modified in place and the same object is returned, so
    callers may use either the return value or the original reference.
    Non-numeric columns are left untouched.

    Args:
        data: DataFrame to clean.

    Returns:
        The same DataFrame, with NaNs in numeric columns replaced by the
        mean of that column.
    """
    # numeric_only=True is required: without it, recent pandas versions
    # raise TypeError as soon as the frame contains a non-numeric column.
    column_means = data.mean(numeric_only=True)
    data.fillna(column_means, inplace=True)
    return data

def feature_engineering(data):
    """Split ``data`` into a feature matrix and a target column.

    Every column whose name contains the substring ``'feature'`` becomes an
    input; the ``'action'`` column is the target.

    Returns:
        A ``(X, y)`` tuple: the selected feature columns and the target.
    """
    feature_columns = []
    for column in data.columns:
        if 'feature' in column:
            feature_columns.append(column)

    target = data['action']
    inputs = data[feature_columns]
    return inputs, target

def scale_features(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then discarded; only the
    transformed data is returned.
    """
    return StandardScaler().fit_transform(X)

def train_model(X_train, y_train):
    """Fit an ``XGBClassifier`` on the training data and return it."""
    hyperparameters = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test):
    """Predict on the test set, print accuracy and F1, and return the predictions."""
    predictions = model.predict(X_test)

    # Both metrics are computed before anything is printed.
    accuracy, f1 = (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions),
    )
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1 Score: {f1:.2f}')
    return predictions

def plot_confusion_matrix(y_test, y_pred):
    """Draw the confusion matrix of ``y_test`` vs ``y_pred`` as an annotated heatmap."""
    matrix = confusion_matrix(y_test, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues'}

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

def main():
    """Run the pipeline: load, clean, split, scale, train, evaluate, plot.

    ``train.csv`` is looked up next to this script, or in the current
    working directory when ``__file__`` is not defined.

    Raises:
        FileNotFoundError: If ``train.csv`` does not exist at the resolved
            location.
    """
    # Resolve the directory containing this script; fall back to the
    # working directory when __file__ is undefined.
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        current_dir = os.getcwd()

    # Build the full path and fail early with an actionable message instead
    # of a bare errno from the CSV reader.
    file_path = os.path.join(current_dir, 'train.csv')
    if not os.path.isfile(file_path):
        raise FileNotFoundError(
            f"Training data not found at '{file_path}'. "
            "Place train.csv in that directory before running."
        )

    # Load and clean the data.
    data = load_data(file_path)
    data = preprocess_data(data)

    # Select features and target.
    X, y = feature_engineering(data)

    # Split BEFORE scaling so the scaler never sees the held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training portion only, then apply the same
    # transformation to the test portion; fitting on all rows would leak
    # test-set statistics into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the model.
    model = train_model(X_train, y_train)

    # Evaluate the model.
    y_pred = evaluate_model(model, X_test, y_test)

    # Plot the confusion matrix.
    plot_confusion_matrix(y_test, y_pred)

# Run the pipeline only when this module is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'