In [1]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from pathlib import Path
import gc
from tqdm import tqdm
import math
from lightgbm import LGBMRegressor, plot_importance
from sklearn.metrics import r2_score
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader

2024-12-23 11:06:10.909167: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-23 11:06:10.909425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-23 11:06:11.013800: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 11:06:11.239831: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def evaluate_model(model, X_train, y_train, X_val, y_val, weights, feature_names, cat_features=[]):   
    cat_features_idx = [feature_names.index(f) for f in cat_features]
    if len(cat_features_idx) > 0:
        print(f'Using categorical features: {cat_features_idx}')
    model.fit(X_train, y_train, feature_name=feature_names, categorical_feature=','.join([str(c) for c in cat_features_idx]))
    pred_val = model.predict(X_val).clip(-5, 5)
    return r2_score(y_val, pred_val, sample_weight=weights)



In [None]:
params = {'n_estimators': 200, 'max_depth': 3, 'num_leaves': 8, 'learning_rate': 5e-2}

In [30]:
data_args = {'zero_fill': False, 'ffill': False, 'include_intrastock_norm': True}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

train_ds, val_ds = loader.load_train_and_val(start_dt=1100, end_dt=1200, val_ratio=0.2)

X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

In [31]:
model = LGBMRegressor(**params)

evaluate_model(model, X_train, y_train, X_val, y_val, w_val, loader.features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.191975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33034
[LightGBM] [Info] Number of data points in the train set: 3016288, number of used features: 133
[LightGBM] [Info] Start training from score -0.003544


0.009591148340158062

In [32]:
model = LGBMRegressor(**params)

evaluate_model(model, X_train, y_train, X_val, y_val, w_val, loader.features, loader.categorical_features)

Using categorical features: [9, 10, 11]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.192719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33034
[LightGBM] [Info] Number of data points in the train set: 3016288, number of used features: 133
[LightGBM] [Info] Start training from score -0.003544


0.009591148340157951

In [None]:
data_args = {'include_lags': True}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

train_ds, val_ds = loader.load_train_and_val(start_dt=1100, end_dt=1200, val_ratio=0.2)

X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

In [None]:
model = LGBMRegressor(**params)

evaluate_model(model, X_train, y_train, X_val, y_val, w_val, loader.features)

In [None]:
data_args = {'include_lags': True}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

train_ds, val_ds = loader.load_train_and_val(start_dt=1100, end_dt=1200, val_ratio=0.2)

In [None]:
target = 'responder_6'
schema = {'date_id': pl.UInt32, 'time_id': pl.UInt32, 'datetime': pl.Datetime, target: pl.Float32}
schema.update({
    f'{target}_knn_{n}_{persistence}_{agg}': pl.Float32 for n in knn_windows for agg in aggs
})
n_dates = _df['date_id'].unique().count()
num_groups = max(0, n_dates - period + 1)
with tqdm(total=int(num_groups)) as pbar:
    _df = _df.sort('datetime').group_by_dynamic(
        pl.col('date_id').cast(pl.Int64),
        period=f"{period}i",
        every="1i",
        closed='both',
    ).map_groups(
        wrapper_pbar(
            pbar, 
            lambda x: _knn_features(x, period=period, knn_windows=knn_windows, features=features, target=target, schema=schema, persistence=persistence, aggs=aggs)
        ),
        schema=schema
    ).sort('datetime').drop(target, 'date_id', 'time_id')

df = df.join(
    _df, on='datetime', how='left'
)