In [3]:
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader
import polars as pl
from sklearn.metrics import r2_score
from prj.data.data_loader import PARTITIONS_DATE_INFO
import pandas as pd
import lightgbm as lgb
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import gc
from tqdm import tqdm
import numpy as np
from prj.utils import online_iterator
SEED = 42

In [None]:
data_args = {'include_time_id': True, 'include_intrastock_norm_temporal': False}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

start_dt, end_dt = 1530, 1600
complete_ds = loader.load(start_dt, end_dt)

train_ds = complete_ds.filter(pl.col('date_id').le(1580))
X_train, y_train, w_train, _ = loader._build_splits(train_ds)
train_data = lgb.Dataset(X_train, label=y_train)
del X_train, y_train, w_train
gc.collect()
test_ds = complete_ds.filter(pl.col('date_id').gt(1580))

In [9]:
model = lgb.train(
    train_set=train_data,
    params={'learning_rate': 0.05}
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.253980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19492
[LightGBM] [Info] Number of data points in the train set: 1893408, number of used features: 80
[LightGBM] [Info] Start training from score 0.001139


In [14]:
X_test, y_test, w_test, _ = loader._build_splits(test_ds)

y_hat = model.predict(X_test).clip(-5, 5).flatten()
offline_score = r2_score(y_test, y_hat, sample_weight=w_test)
offline_score

-0.020834377508535784

In [15]:
del X_test
gc.collect()

4

In [16]:
features = loader.features
print(len(features))

80


In [9]:
MEAN_FEATURES = [0, 2, 3, 5, 6, 7, 18, 19, 34, 35, 36, 37, 38, 41, 43, 44, 48, 53, 55, 59, 62, 65, 68, 73, 74, 75, 76, 77, 78]
STD_FEATURES = [39, 42, 46, 53, 57, 66]
SKEW_FEATURES = [5, 40, 41, 42, 43, 44]
ZSCORE_FEATURES = [1, 36, 40, 45, 48, 49, 51, 52, 53, 54, 55, 59, 60]

def include_intrastock_norm(df: pl.LazyFrame, responder) -> pl.LazyFrame:
    df = df.with_columns(
        pl.col([f'feature_{j:02d}' for j in set(MEAN_FEATURES + ZSCORE_FEATURES)]).mean().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_mean'),
        pl.col([f'feature_{j:02d}' for j in set(STD_FEATURES + ZSCORE_FEATURES)]).std().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_std'),
        pl.col([f'feature_{j:02d}' for j in SKEW_FEATURES]).skew().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_skew'),
    ).with_columns(
        pl.col(f'feature_{j:02d}').sub(f'feature_{j:02d}_{responder}_mean').truediv(f'feature_{j:02d}_{responder}_std').name.suffix(f'_{responder}_zscore') for j in ZSCORE_FEATURES
    ).drop([f'feature_{j:02d}_{responder}_std' for j in ZSCORE_FEATURES if j not in STD_FEATURES] + \
        [f'feature_{j:02d}_{responder}_mean' for j in ZSCORE_FEATURES if j not in MEAN_FEATURES])
    return df

# Inference

In [None]:
test_ds = test_ds.collect()

In [None]:
USE_INTRA_STOCK_NORM = True
USE_TIME_NORM_ID = True


corr_responder = 'responder_6'

period = loader.window_period
stock_cluster_mapping = {}
stock_max_time_id = {}
default_cluster=-1
default_max_time_id=967

replay_config = DataConfig()
replay_loader = DataLoader(data_dir=DATA_DIR, config=replay_config)
responder_replay_buffer = replay_loader.load(start_dt-1-period, start_dt-2).select('date_id', 'time_id', 'symbol_id', 'responder_6').collect()

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    global responder_replay_buffer, stock_cluster_mapping, stock_max_time_id, default_max_time_id
    
    if lags is not None:
        curr_date = test['date_id'].first()
        
        # Time id norm preparation
        if USE_TIME_NORM_ID:
            stock_max_time_id_map = lags.group_by('symbol_id').agg(pl.col('time_id').max())
            stock_max_time_id = dict(zip(stock_max_time_id_map['symbol_id'], stock_max_time_id_map['time_id']))
            default_max_time_id = max(list(stock_max_time_id.values()))

        
        # Intrastock normalization preparation
        if USE_INTRA_STOCK_NORM:
            responder_replay_buffer = responder_replay_buffer.vstack(
                lags.select(pl.col('date_id').sub(1), 'time_id', 'symbol_id', pl.col(f'{corr_responder}_lag_1').alias(corr_responder))
            ).filter(pl.col('date_id').is_between(curr_date-period, curr_date))

            pivot = responder_replay_buffer.filter(pl.col('date_id') < curr_date)\
                        .pivot(index=['date_id', 'time_id'], values=[corr_responder], separator='_', on='symbol_id')\
                        .sort('date_id', 'time_id') \
                        .fill_nan(None)\
                        .fill_null(strategy='zero')

            corr_cols = [col for col in pivot.columns if col not in ['date_id', 'time_id']]
            stocks = [int(col) for col in corr_cols]
            df_corr_responder = pivot.select(corr_cols).corr()
            linked = linkage(df_corr_responder, method='ward')
            cluster_labels = fcluster(linked, t=2.5, criterion='distance')
            stock_cluster_mapping = dict(zip(stocks, cluster_labels))

    if USE_TIME_NORM_ID:
        test = test.with_columns(
            pl.col('symbol_id').replace_strict(
                stock_max_time_id, default=default_max_time_id, return_dtype=pl.Int16
            ).alias('max_prev_stock_time_id'),
        ).with_columns(
            pl.col('time_id').truediv(
                'max_prev_stock_time_id'
            ).alias('time_id_norm')
        ).drop('max_prev_stock_time_id')
        
    if USE_INTRA_STOCK_NORM:
        test = test.with_columns(
            pl.col('symbol_id').replace_strict(
                stock_cluster_mapping, default=default_cluster, return_dtype=pl.Int8
            ).alias(f'cluster_label_{corr_responder}')
        ).pipe(
            include_intrastock_norm, 
            corr_responder
        ).drop(f'cluster_label_{corr_responder}')
    
    
    
    X = test.select(features).cast(pl.Float32).to_numpy()
    y_hat = model.predict(X, task_type='GPU').clip(-5, 5).flatten()

    predictions = test.select('row_id', pl.Series(y_hat).alias('responder_6'))


    assert len(predictions) == len(test)

    return predictions



y_hat_iterator = []

for test, lags in online_iterator(test_ds, show_progress=True):
    res = predict(test, lags)
    y_hat_iterator.append(res['responder_6'].to_numpy())

y_hat_iterator = np.concatenate(y_hat_iterator)

r2_score(y_true=y_hat, y_pred=y_hat_iterator, sample_weight=w_test)


 67%|██████▋   | 110104/163592 [21:56<10:39, 83.68it/s] 

In [27]:
for i in range(len(y_hat)):
    if y_hat[i] != y_hat_iterator[i]:
        print(i, y_hat[i], y_hat_iterator[i])

0 -0.03899233464068175 -0.038992334390059114
1 0.06370154570097952 0.06370154372416437
2 0.04585015447921137 0.04585015377961099
3 -0.037372863503975395 -0.037372861290350556
4 0.05724305218197401 0.05724305263720453
5 0.0868973143534788 0.08689731708727777
6 0.03840196819916741 0.038401968544349074
7 0.020591817328380557 0.020591817563399673
8 0.07228725069560937 0.07228725007735193
9 0.01671117094156216 0.016711171483621
10 0.03131460976607059 0.03131461120210588
11 0.058220748637069164 0.05822074762545526
12 -0.0890649879878822 -0.08906498947180808
13 0.0483706209995205 0.04837062186561525
14 0.0055129656981876 0.005512966541573405
15 0.06607453157795362 0.06607453129254282
16 0.03950028436807429 0.039500284707173705
17 0.0830991846344773 0.08309918246231973
18 -0.04478291445983018 -0.04478291445411742
19 -0.05299656148255849 -0.05299656116403639
20 -0.014638919854718324 -0.014638919616118073
21 0.059502221768298114 0.05950222141109407
22 0.007570058095183185 0.007570057874545455
23