In [1]:
# https://www.kaggle.com/competitions/drw-crypto-market-prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import xgboost as xgb
import lightgbm as lgb
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from scipy.stats import pearsonr
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings, os
# Suppress warnings for the rest of the session (no category/module filter is given,
# so the "ignore" action applies to every warning).
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries imported above — consider narrowing it with `category=` / `module=`.
warnings.filterwarnings("ignore")

# Seed the random number generators
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed to every generator (default 42).
    """
    # Function-local import: the notebook's import cell does not bring in `random`.
    import random

    # Python's built-in generator was previously left unseeded, so anything
    # relying on `random` was not reproducible between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [2]:
# Data loading and first-look analysis
def load_data(data_dir):
    """Load the train, test and sample-submission files and print a quick overview.

    Reads ``train.parquet``, ``test.parquet`` and ``sample_submission.csv`` from
    ``data_dir``. It then prints the shape of each frame and, for the training
    frame only, the total number of missing values, the ``DataFrame.info()``
    report and the ``DataFrame.describe()`` summary.

    Args:
        data_dir: Directory that contains the three files.

    Returns:
        Tuple ``(train_df, test_df, sample_submission_df)``.

    Raises:
        Nothing is caught here: errors raised by ``pd.read_parquet`` or
        ``pd.read_csv`` propagate to the caller.
    """
    print("加载数据...")
    train_file = os.path.join(data_dir, "train.parquet")
    test_file = os.path.join(data_dir, "test.parquet")
    submission_file = os.path.join(data_dir, "sample_submission.csv")
    train_df = pd.read_parquet(train_file)
    test_df = pd.read_parquet(test_file)
    sample_submission_df = pd.read_csv(submission_file)
    print(f"训练数据形状: {train_df.shape}")
    print(f"测试数据形状: {test_df.shape}")
    print(f"样例提交文件形状: {sample_submission_df.shape}")

    # Total count of missing cells across the whole training frame
    print("\n训练数据缺失值:")
    print(train_df.isnull().sum().sum())

    # Basic structure of the training frame.
    # info() writes its report itself and returns None, so wrapping it in
    # print() emitted a stray "None" line after the report.
    print("\n训练数据基本信息:")
    train_df.info()

    # Descriptive statistics of the training frame
    print("\n数据统计摘要:")
    print(train_df.describe())

    return train_df, test_df, sample_submission_df

# Directory holding the competition files; the path is relative, so it is
# resolved against the kernel's current working directory.
DATA_DIR = r"../data/kaggle-drw-crypto-market-prediction"
# Load all three frames once; the cells below reuse these names.
train_df, test_df, sample_submission_df = load_data(DATA_DIR)

加载数据...
训练数据形状: (525887, 896)
测试数据形状: (538150, 896)
样例提交文件形状: (538150, 2)

训练数据缺失值:
0

训练数据基本信息:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 525887 entries, 2023-03-01 00:00:00 to 2024-02-29 23:59:00
Columns: 896 entries, bid_qty to label
dtypes: float64(896)
memory usage: 3.5 GB
None

数据统计摘要:
             bid_qty        ask_qty        buy_qty       sell_qty  \
count  525887.000000  525887.000000  525887.000000  525887.000000   
mean        9.968003      10.174169     131.726678     132.673944   
std        15.645741      15.889582     307.267251     309.803040   
min         0.001000       0.001000       0.000000       0.000000   
25%         2.634000       2.678000      26.407000      27.021000   
50%         6.415000       6.538000      57.015000      58.047000   
75%        13.085000      13.330000     127.639000     129.110000   
max      1114.932000    1352.965000   17614.400000   17686.234000   

              volume             X1             X2             X3  \
count

In [10]:
# Preview the first rows. A bare last expression uses the notebook's rich
# table display instead of the wrapped plain-text output of print().
train_df.head()

                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
timestamp                                                                     
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.121263   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.302841   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.167462   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.072944   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.173820   

                           X2        X3        X4        X5  ...      X882  \
timestamp                                                    ...             
2023-03-01 00:00:00 -0.417690  0.005399  0.125948  0.058359  ...  1.925423   
2023-03-01 00:01:00 -0.049576  0.356667  0.481087  0.237954  ...  1.928569   
2023-03-01 00:02:00 -0.291212  0.083138  0.206881  0.101727  ...  1.928047   
2023-03-01 00:03:00 -0.436590 -0.102483  0.017551  0.007

In [11]:
# Show the name of the training frame's index as the cell's output value,
# the same way the test frame's index name is inspected.
train_df.index.name

timestamp


In [4]:
# List the column labels of the training frame.
train_df.columns

Index(['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X1', 'X2', 'X3',
       'X4', 'X5',
       ...
       'X882', 'X883', 'X884', 'X885', 'X886', 'X887', 'X888', 'X889', 'X890',
       'label'],
      dtype='object', length=896)

In [18]:
# Time-series downsampling of the timestamp-indexed training frame:
# one row per day, each column averaged over that day.
train_df_resampled = train_df.resample(rule='1D').mean()
# Report the resulting shape and confirm the index name survived.
print(train_df_resampled.shape, train_df_resampled.index.name, sep="\n")
# Save the daily means as CSV inside the data directory.
train_df_resampled.to_csv(
    os.path.join(DATA_DIR, "train_resampled.csv")
)

(366, 896)
timestamp


In [19]:
# Preview the first rows of the test frame. A bare last expression uses the
# notebook's rich table display instead of wrapped plain text.
test_df.head()

    bid_qty  ask_qty  buy_qty  sell_qty   volume        X1        X2  \
ID                                                                     
1     0.114   12.121   10.587    10.971   21.558 -0.732818  0.512331   
2     2.426    2.962  136.241    12.304  148.545 -0.337995 -0.412176   
3     1.085    2.343   23.390    57.171   80.561  0.111249  0.458221   
4    14.793    1.117  116.518    13.082  129.600 -0.149399 -0.640638   
5     0.033   14.178   43.800    49.836   93.636 -0.694662  0.611254   

          X3        X4        X5  ...      X882      X883      X884      X885  \
ID                                ...                                           
1  -0.041982 -0.598260 -0.517646  ...  1.014336  1.367567  1.584126  1.584126   
2  -0.259468 -0.334809 -0.247443  ...  1.748939  1.848177  0.000131  0.000323   
3   0.466916  0.574081  0.324722  ...  1.704680  1.772028  0.000550  0.003597   
4  -0.873778 -1.026144 -0.508816  ...  0.930946  1.037839  1.382037  1.382037   
5   0.067

In [20]:
# Show the name of the test frame's index.
test_df.index.name

'ID'

In [23]:
# List the column labels of the test frame as the cell's output value,
# the same way the training frame's columns are inspected.
test_df.columns

Index(['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X1', 'X2', 'X3',
       'X4', 'X5',
       ...
       'X882', 'X883', 'X884', 'X885', 'X886', 'X887', 'X888', 'X889', 'X890',
       'label'],
      dtype='object', length=896)


In [21]:
# Draw a random sample of 1000 test rows (fixed random_state) and save it as CSV.
# NOTE(review): despite the variable name this is a random sample, not a time-based resample.
test_df_resampled = test_df.sample(random_state=42, n=1000)
test_df_resampled.to_csv(
    os.path.join(DATA_DIR, "test_resampled.csv")
)

In [22]:
# Preview the sample submission file. A bare last expression uses the
# notebook's rich table display instead of plain text.
sample_submission_df.head()

   ID  prediction
0   1   -0.280233
1   2    1.371969
2   3   -2.045252
3   4   -1.447555
4   5   -1.303901


In [None]:
# 特征工程数据预处理
def preprocess_data(train_df, test_df):
    print("\n数据预处理...")
    # 分离训练集的特征和标签
    X_train = train_df.drop(columns="label", axis=1)
    y_train = train_df["label"]

    # 分离测试集的特征和标签
    X_test = test_df.drop(columns="label", axis=1)
    y_test = test_df["label"]