<a href="https://colab.research.google.com/github/nanpolend/machine-learning/blob/master/%E7%95%B6%E5%89%8D%E7%AC%AC1%E5%90%8D%EF%BC%9AJane_Street%EF%BC%9AAE_MLP%2Bxgb_Gemini%E4%BF%AE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# -*- coding: utf-8 -*-
"""Jane Street Market Prediction 完整範例程式碼"""
import warnings
warnings.filterwarnings('ignore')

# 資料處理與機器學習套件
!pip install janestreet  # 安裝janestreet套件
import os, gc
import cudf
import pandas as pd
import numpy as np
import cupy as cp
import janestreet

# 模型與評估
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold

# 深度學習相關
import tensorflow as tf
tf.random.set_seed(2212)
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Activation, GaussianNoise
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

# 輔助工具
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

# 環境設定
TEST = False  # 設為False執行完整訓練

# ========== 自定義函數與類別 ==========
class PurgedGroupTimeSeriesSplit:
    """時間序列交叉驗證實現"""
    def __init__(self, n_splits=5, group_gap=31):
        self.n_splits = n_splits
        self.group_gap = group_gap

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        indices = np.arange(n_samples)
        group_array = np.array(groups)
        unique_groups = np.unique(group_array)
        n_groups = len(unique_groups)
        gap_size = self.group_gap * n_groups // self.n_splits
        split_idx = []
        for i in range(self.n_splits):
            test_group_idx = np.arange(i * n_groups // self.n_splits, (i + 1) * n_groups // self.n_splits)
            test_groups = unique_groups[test_group_idx]
            test_idx = np.where(np.isin(group_array, test_groups))[0]
            train_groups = np.setdiff1d(unique_groups, test_groups)
            train_groups = train_groups[train_groups < np.min(test_groups) - gap_size]
            train_idx = np.where(np.isin(group_array, train_groups))[0]
            split_idx.append((train_idx, test_idx))
        return split_idx

def weighted_average(a):
    """加權平均計算"""
    w = [1/(2**(len(a)+1-j)) if j>1 else 1/(2**len(a)) for j in range(1,len(a)+1)]
    return np.average(a, weights=w)

def reduce_mem_usage(df):
    """記憶體優化函數"""
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'記憶體用量從 {start_mem:.2f} MB 減少至 {end_mem:.2f} MB')
    return df

def create_ae_mlp(num_columns, num_labels, hidden_units, dropout_rates, ls=1e-2, lr=1e-3):
    """自編碼器+MLP混合模型"""
    # 輸入層
    inp = Input(shape=(num_columns,))
    x0 = BatchNormalization()(inp)

    # 自編碼器部分
    encoder = GaussianNoise(0.1)(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)

    # 解碼器
    decoder = Dropout(dropout_rates[0])(encoder)
    decoder = Dense(num_columns, name='decoder')(decoder)

    # 多任務學習頭
    x = K.concatenate([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[1])(x)

    # 隱藏層堆疊
    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i])(x)

    # 輸出層
    outputs = []
    for _ in range(num_labels):
        outputs.append(Dense(1, activation='sigmoid')(x))

    # 模型編譯
    model = Model(inputs=inp, outputs=[decoder] + outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=['mse'] + [tf.keras.losses.BinaryCrossentropy(label_smoothing=ls) for _ in range(num_labels)],
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )
    return model

# ========== 主要執行流程 ==========
if __name__ == "__main__":
    # 資料載入與預處理
    print('\n[1/5] 載入資料...')
    train = cudf.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
    features = [c for c in train.columns if 'feature' in c]
    train = train.to_pandas()

    print('\n[2/5] 資料清洗...')
    train = train.query('date > 85 and weight > 0').reset_index(drop=True)
    train[features] = train[features].fillna(method='ffill').fillna(0)
    train['action'] = ((train[['resp','resp_1','resp_2','resp_3','resp_4']] > 0).all(axis=1)).astype(int)
    resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

    # 特徵工程
    print('\n[3/5] 特徵工程...')
    X = train[features].values
    y = np.stack([(train[c] > 0).astype(int) for c in resp_cols]).T
    sw = np.mean(np.abs(train[resp_cols].values), axis=1)

    # 模型參數
    params = {
        'num_columns': len(features),
        'num_labels': 5,
        'hidden_units': [96, 96, 896, 448, 448, 256],
        'dropout_rates': [0.035, 0.038, 0.424, 0.104, 0.492, 0.320, 0.272, 0.438],
        'ls': 0,
        'lr': 1e-3,
    }

    # 訓練流程
    if not TEST:
        print('\n[4/5] 開始訓練...')
        scores, models = [], []
        gkf = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)

        for fold, (tr_idx, val_idx) in enumerate(gkf.split(X, groups=train['date'])):
            print(f'\n--- Fold {fold+1}/5 ---')
            X_tr, X_val = X[tr_idx], X[val_idx]
            y_tr, y_val = y[tr_idx], y[val_idx]

            # 模型初始化
            model = create_ae_mlp(**params)
            checkpoint = ModelCheckpoint(f'best_model_fold{fold}.h5', save_best_only=True, monitor='val_auc', mode='max')
            early_stop = EarlyStopping(monitor='val_auc', patience=5, restore_best_weights=True, mode='max')

            # 訓練執行
            history = model.fit(
                X_tr, [X_tr] + [y_tr[:,i] for i in range(5)],
                validation_data=(X_val, [X_val] + [y_val[:,i] for i in range(5)]),
                epochs=100,
                batch_size=4096,
                callbacks=[checkpoint, early_stop],
                verbose=1
            )

            # 模型評估
            model.load_weights(f'best_model_fold{fold}.h5')
            val_pred = np.mean([model.predict(X_val, batch_size=4096)[i+1] for i in range(5)], axis=0)
            score = roc_auc_score(y_val.mean(axis=1), val_pred)
            scores.append(score)
            models.append(model)
            print(f'Fold {fold+1} AUC: {score:.5f}')
            gc.collect()

        # 最終評估
        print('\n[5/5] 訓練完成')
        print(f'各Fold分數: {scores}')
        print(f'加權平均分數: {weighted_average(scores):.5f}')

[31mERROR: Could not find a version that satisfies the requirement janestreet (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for janestreet[0m[31m
[0m

ModuleNotFoundError: No module named 'janestreet'