# Libraries

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
from prj.data.data_loader import DataConfig, DataLoader

# Build the project data loader with lag features and forward-fill switched
# off and zero-fill switched on.
# NOTE(review): the exact meaning of each flag is defined in
# prj.data.data_loader, not in this notebook — confirm there.
config = DataConfig(include_lags=False, ffill=False, zero_fill=True)
loader = DataLoader(config=config)

2024-12-17 23:00:40.897903: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-17 23:00:40.897939: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-17 23:00:40.899336: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-17 23:00:40.906219: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Ask the loader for the train/validation split; each side is unpacked below
# into four objects: features, targets, weights and info.
# NOTE: `train` is rebound to a polars LazyFrame further down in this notebook.
train, val = loader.load_numpy_train_and_val(
    start_dt=1100,
    val_ratio=0.2,
)

(X_train, y_train, w_train, info_train) = train
(X_val, y_val, w_val, info_val) = val

# Show the feature-matrix shapes as a quick sanity check.
X_train.shape, X_val.shape

((20573872, 79), (1530408, 79))

# Configurations

In [4]:
class CONFIG:
    """Notebook-wide constants for the lag-building and split cells."""

    # Responder column used to derive the integer "label" column.
    target_col = "responder_6"
    # Join keys followed by the nine responder columns (responder_0..responder_8).
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{i}" for i in range(9))]
    # Maps each responder column to its "<name>_lag_1" counterpart.
    lag_cols_rename = dict((f"responder_{i}", f"responder_{i}_lag_1") for i in range(9))
    # Fraction of rows (counted from the end) reserved for validation.
    valid_ratio = 0.05
    # Only rows with date_id strictly greater than this value are kept.
    start_dt = 1100

# Load training data

In [11]:
# Lazily scan ten part files (part-0 .. part-9) under `partition_id=0` and
# stack them into a single LazyFrame.
# NOTE(review): an earlier comment here said "Use last 2 parquets", which does
# not match range(10). Also confirm the on-disk layout: the loop varies the
# part number inside one partition directory, not the partition id.
from prj.config import DATA_DIR


part_scans = [
    pl.scan_parquet(DATA_DIR / 'partition_id=0' / f'part-{i}.parquet')
    for i in range(10)
]

train = (
    pl.concat(part_scans)
    # Running row number over the concatenated frame, assigned before filtering.
    .select(
        pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
        pl.all(),
    )
    # Integer label: the target column doubled and cast to Int32.
    .with_columns(
        (pl.col(CONFIG.target_col) * 2).cast(pl.Int32).alias("label"),
    )
    # Strict comparison: rows whose date_id equals start_dt are dropped.
    .filter(pl.col("date_id").gt(CONFIG.start_dt))
)

# Create Lags data from training data

In [None]:
# Build the lag table: for every (date_id, symbol_id) keep the last responder
# row of the previous date, with the responders renamed to *_lag_1.
lags = (
    train
    .select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    # Shift date_id forward by one so day d's values line up with day d + 1.
    .with_columns(date_id=pl.col('date_id') + 1)
    # Keep the last record per (shifted date, symbol), preserving input order.
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)
lags

# Merge training data and lags data

In [None]:
# Attach the lagged responders to every training row. The left join keeps rows
# with no lag match (their lag columns are null).
# NOTE: this rebinds `train`, so re-running the cell joins onto the
# already-joined frame.
train = train.join(
    lags,
    on=["date_id", "symbol_id"],
    how="left",
)
train

# Split training data and validation data

In [None]:
# Split by date: find the date_id at the row where the last `valid_ratio`
# fraction of rows begins, then send every date up to and including it to
# training and every later date to validation.
# NOTE(review): this relies on the rows being ordered by date_id — confirm
# the join above preserves the scan order.

# Materialize the date_id column once and reuse it for both the row count and
# the boundary lookup, so the lazy pipeline is not executed twice.
date_id_df = train.select(pl.col("date_id")).collect()

len_train = date_id_df.height
if len_train == 0:
    raise ValueError("train has no rows after filtering; cannot split into train/validation")

valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records

# When valid_records is 0, len_ofl_mdl equals len_train and would index one
# past the end; clamp to the last row so the split yields an empty validation
# set instead of failing.
boundary_idx = min(len_ofl_mdl, len_train - 1)
last_tr_dt = date_id_df.row(boundary_idx)[0]

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

# Both frames stay lazy; nothing is collected here.
training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))

In [None]:
# Display the validation frame (rows of `train` with date_id > last_tr_dt).
# NOTE(review): presumably still a LazyFrame here, so this shows the query
# plan rather than data — confirm.
validation_data

# Save data as parquets

In [None]:
# Materialize the training split and write it to "training.parquet",
# partitioned by date_id.
(
    training_data
    .collect()
    .write_parquet("training.parquet", partition_by="date_id")
)

In [None]:
# Materialize the validation split and write it to "validation.parquet",
# partitioned by date_id.
(
    validation_data
    .collect()
    .write_parquet("validation.parquet", partition_by="date_id")
)