In [1]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from pathlib import Path
import gc
from tqdm import tqdm
import math
from lightgbm import LGBMRegressor, plot_importance
from sklearn.metrics import r2_score
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader

2025-01-04 09:05:22.385466: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-04 09:05:22.385500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-04 09:05:22.386598: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-04 09:05:22.393213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def evaluate_model(model, X_train, y_train, X_val, y_val, weights, feature_names, cat_features=None):
    """Fit ``model`` on the training split and return its weighted R² on the validation split.

    Parameters
    ----------
    model
        Estimator whose ``fit`` accepts ``feature_name`` and ``categorical_feature``
        keyword arguments and which exposes ``predict`` (the notebook passes an
        ``LGBMRegressor``).
    X_train, y_train
        Training features and target, forwarded unchanged to ``model.fit``.
    X_val, y_val
        Validation features and target.
    weights
        Sample weights for the validation rows, forwarded to ``r2_score``.
    feature_names
        Ordered sequence of column names; forwarded to ``model.fit`` as a list.
    cat_features
        Names (each present in ``feature_names``) to treat as categorical.
        ``None`` or an empty sequence means no categorical features.

    Returns
    -------
    The value of ``r2_score(y_val, clipped_predictions, sample_weight=weights)``.

    Raises
    ------
    ValueError
        If a name in ``cat_features`` does not occur in ``feature_names``
        (raised by ``list.index``).
    """
    feature_names = list(feature_names)
    requested = [] if cat_features is None else list(cat_features)

    # Translate categorical column names into positional indices.
    cat_features_idx = [feature_names.index(f) for f in requested]
    if len(cat_features_idx) > 0:
        print(f'Using categorical features: {cat_features_idx}')

    # The Python API expects a list of indices (or 'auto'); the comma-joined
    # string form is the CLI/config-file syntax and is not honoured by ``fit``.
    model.fit(X_train, y_train, feature_name=feature_names, categorical_feature=cat_features_idx)

    # Predictions are clipped to [-5, 5] before scoring
    # (presumably the valid target range -- TODO confirm).
    pred_val = model.predict(X_val).clip(-5, 5)
    return r2_score(y_val, pred_val, sample_weight=weights)



In [3]:
# Hyper-parameters shared by every LGBMRegressor built in this notebook.
# num_leaves = 8 = 2**max_depth, i.e. the most leaves a depth-3 tree can have.
params = {
    'n_estimators': 200,
    'max_depth': 3,
    'num_leaves': 8,
    'learning_rate': 0.05,
}

In [4]:
# Baseline run: no options are set, so DataConfig is built with its defaults.
data_args = dict()
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.448876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19354
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 79
[LightGBM] [Info] Start training from score 0.001341


0.004925303885731869

In [5]:
# Variant: enable the `include_symbol_id` option of DataConfig.
data_args = dict(include_symbol_id=True)
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.467964 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19393
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 80
[LightGBM] [Info] Start training from score 0.001341


0.005006532263017727

In [6]:
# Variant: enable both `include_time_id` and `include_symbol_id`.
data_args = dict(include_time_id=True, include_symbol_id=True)
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.353123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19648
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 81
[LightGBM] [Info] Start training from score 0.001341


0.005115153483336354

In [7]:
# Variant: enable only the `include_time_id` option of DataConfig.
data_args = dict(include_time_id=True)
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.250700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19609
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 80
[LightGBM] [Info] Start training from score 0.001341


0.005132033405778502

In [8]:
# Variant: enable `include_time_id`, `include_intrastock_norm_temporal`
# and `include_symbol_id` together.
data_args = dict(
    include_time_id=True,
    include_intrastock_norm_temporal=True,
    include_symbol_id=True,
)
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

100%|██████████| 346/346 [00:21<00:00, 16.28it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.524577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33418
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 135
[LightGBM] [Info] Start training from score 0.001341


0.005789860094759236

In [9]:
# Variant: `include_time_id` and `include_intrastock_norm_temporal` enabled,
# with `include_symbol_id` explicitly switched off.
data_args = dict(
    include_time_id=True,
    include_intrastock_norm_temporal=True,
    include_symbol_id=False,
)
config = DataConfig(**data_args)
loader = DataLoader(config=config, data_dir=DATA_DIR)

# Load via load_with_partition(8, 9), then split on partition_id:
# rows tagged 8 are used for training, rows tagged 9 for validation.
complete_ds = loader.load_with_partition(8, 9)
train_ds = complete_ds.filter(pl.col('partition_id') == 8)
val_ds = complete_ds.filter(pl.col('partition_id') == 9)

# _build_splits returns four values; the fourth is discarded here.
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

model = LGBMRegressor(**params)

# Trailing expression so the notebook displays the validation score.
evaluate_model(model, X_train, y_train, X_val, y_val, weights=w_val, feature_names=loader.features)

100%|██████████| 346/346 [00:20<00:00, 16.83it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.293371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33379
[LightGBM] [Info] Number of data points in the train set: 6140024, number of used features: 134
[LightGBM] [Info] Start training from score 0.001341


0.005880807434581414