In [1]:
import polars as pl
import numpy as np
import os
import gc

# Master switch: set to False to keep this notebook off the GPU.
USE_GPU = True
if not USE_GPU:
    # Hide every CUDA device from libraries imported after this cell.
    os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [2]:
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader

# Options forwarded verbatim to DataConfig.
data_args = dict(zero_fill=True, include_intrastock_norm=True)
config = DataConfig(**data_args)

# Loader bound to the project's data directory; every later cell reads data through it.
loader = DataLoader(config=config, data_dir=DATA_DIR)

2024-12-22 11:51:36.174192: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-22 11:51:36.174249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-22 11:51:36.175600: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-22 11:51:36.185791: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the data between date 1300 and date 1400 (the later training cell calls
# `loader.load(start_dt, end_dt)` with a min/max date, so these are date bounds).
# NOTE(review): whether the end bound is inclusive is not visible here — confirm in DataLoader.
# NOTE(review): `df` is not referenced by any later cell shown in this notebook;
# this looks like a leftover exploratory load.
df = loader.load(1300, 1400)

In [4]:
# Display the feature column names the loader exposes; the model cell below sizes
# its input layer with `len(loader.features)`.
loader.features

['feature_00',
 'feature_01',
 'feature_02',
 'feature_03',
 'feature_04',
 'feature_05',
 'feature_06',
 'feature_07',
 'feature_08',
 'feature_09',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_24',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_30',
 'feature_31',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_37',
 'feature_38',
 'feature_39',
 'feature_40',
 'feature_41',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_45',
 'feature_46',
 'feature_47',
 'feature_48',
 'feature_49',
 'feature_50',
 'feature_51',
 'feature_52',
 'feature_53',
 'feature_54',
 'feature_55',
 'feature_56',
 'feature_57',
 'feature_58',
 'feature_59',
 'feature_60',
 'feature_61',
 'feature_62',
 'feature_63',
 'feature_64',
 'feature_65',
 'feature_

In [3]:
from prj.data.data_loader import PARTITIONS_DATE_INFO
def _rows_and_dates(ds):
    """Return ``(n_rows, n_distinct_date_ids)`` for a lazy dataset.

    Both numbers are computed inside a single ``collect`` so the underlying
    lazy query is executed once per dataset instead of once per statistic.
    ``None`` (no dataset) yields ``(0, 0)``.
    """
    if ds is None:
        return 0, 0
    return ds.select(
        pl.len().alias('n_rows'),
        # Non-null distinct dates, matching `.unique().collect().count()`.
        pl.col('date_id').unique().count().alias('n_dates'),
    ).collect().row(0)


# Training window: the date range covered by partition 8.
# start_dt, end_dt = 1020, 1300
start_dt, end_dt = PARTITIONS_DATE_INFO[8]['min_date'], PARTITIONS_DATE_INFO[8]['max_date']
val_ratio = 0.2  # not used in this cell; kept in case other cells read it
es_ratio = 0.1   # fraction of the most recent training dates held out for early stopping
early_stopping = True

train_ds = loader.load(start_dt, end_dt)
# Validation uses the next partition (9), i.e. data outside the training window.
val_ds = loader.load_with_partition(start_part_id=9, end_part_id=9)

es_ds = None
if early_stopping:
    # Outside (0, 1) the split below would either index past the last date or
    # leave the training set empty, so fail early with a clear message.
    if not 0 < es_ratio < 1:
        raise ValueError(f'es_ratio must be strictly between 0 and 1, got {es_ratio}')
    # Split chronologically on whole dates so a date never straddles train and ES.
    train_dates = train_ds.select('date_id').unique().collect().to_series().sort()
    split_point = int(len(train_dates) * (1 - es_ratio))
    split_date = train_dates[split_point]
    es_ds = train_ds.filter(pl.col('date_id').ge(split_date))
    train_ds = train_ds.filter(pl.col('date_id').lt(split_date))

n_rows_train, n_dates_train = _rows_and_dates(train_ds)
n_rows_es, n_dates_es = _rows_and_dates(es_ds if early_stopping else None)
n_rows_val, n_dates_val = _rows_and_dates(val_ds)
print(f'N rows train: {n_rows_train}, ES: {n_rows_es}, VAL: {n_rows_val}')
print(f'N dates train: {n_dates_train}, ES: {n_dates_es}, VAL: {n_dates_val}')

N rows train: 5502112, ES: 637912, VAL: 6274576
N dates train: 153, ES: 17, VAL: 169


In [4]:
from prj.model.keras.mlp import Mlp


# Architecture and regularisation settings of the MLP, gathered in one place.
mlp_params = dict(
    hidden_units=[128, 64, 32],
    use_gaussian_noise=True,
    use_batch_norm=True,
    use_dropout=True,
    dropout_rate=0.1,
)

# The input layer gets one unit per feature column exposed by the loader.
model = Mlp(input_dim=(len(loader.features),), **mlp_params)

In [5]:
# Materialise features, targets and sample weights for the training split
# (the fourth value returned by `_build_splits` is not needed here).
X_train, y_train, w_train, _ = loader._build_splits(train_ds)

# The early-stopping arrays only exist when early stopping is enabled.
if early_stopping:
    X_es, y_es, w_es, _ = loader._build_splits(es_ds)
else:
    X_es = y_es = w_es = None

# Show the shape of every array that was built. Skipping the `None` placeholders
# keeps this cell from raising AttributeError when early stopping is disabled.
tuple(
    arr.shape
    for arr in (X_train, y_train, w_train, X_es, y_es, w_es)
    if arr is not None
)

((5502112, 79), (5502112,), (5502112,), (637912, 79), (637912,), (637912,))

In [6]:
from keras import optimizers as tfko
from keras import metrics as tfkm
from keras import callbacks as tfkc


# Training configuration.
batch_size = 1024
loss = 'mse'
optimizer = tfko.Adam(learning_rate=1e-4)
metrics = [tfkm.R2Score(), tfkm.MeanSquaredError()]

# Non-data options for the fit call. The arrays are deliberately kept out of this
# dict so that no extra global keeps them alive once they are deleted later.
fit_options = dict(
    batch_size=batch_size,
    epochs=50,
    loss=loss,
    optimizer=optimizer,
    metrics=metrics,
)

# Weighted fit; the early-stopping split is used as validation data when enabled.
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    validation_data=((X_es, y_es, w_es) if early_stopping else None),
    **fit_options,
)

2024-12-19 16:15:34.890778: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-19 16:15:34.892039: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-19 16:15:34.892263: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Training with early stopping patience 5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 79)]              0         
                                                                 
 gaussian_noise (GaussianNo  (None, 79)                0         
 ise)                                                            
                                                                 
 dense_0 (Dense)             (None, 128)               10240     
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 activation (Activation)     (None, 128)               0         
                                                                 
 dropout (Dropout)   

2024-12-19 16:15:40.973631: I external/local_xla/xla/service/service.cc:168] XLA service 0x70cc056065c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-19 16:15:40.973663: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 960, Compute Capability 5.2
2024-12-19 16:15:40.996798: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-19 16:15:41.058238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1734621341.125599   72342 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Fit complete after 19


In [7]:
# Training is done: drop the notebook's references to the training and
# early-stopping arrays, then run a garbage-collection pass.
del X_train, y_train, w_train
del X_es, y_es, w_es
gc.collect()

1692

In [13]:
import tensorflow as tf
# Build features, targets and sample weights for the validation dataset
# (the fourth value returned by `_build_splits` is discarded).
X_val, y_val, w_val, _ = loader._build_splits(val_ds)
# Run inference inside a CPU device scope, reusing the training batch size.
# NOTE(review): presumably pinned to CPU to stay within GPU memory — confirm.
with tf.device('/CPU:0'):
    y_hat = model.predict(X_val, batch_size=batch_size)
# Display both shapes so a target/prediction mismatch is visible before scoring.
y_val.shape, y_hat.shape


  25/6128 [..............................] - ETA: 12s  

2024-12-19 16:30:23.033774: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1982766016 exceeds 10% of free system memory.




((6274576,), (6274576,))

In [14]:
from prj.metrics import weighted_mae, weighted_mse, weighted_r2, weighted_rmse

# Sample-weighted validation scores, displayed as a dict keyed by metric name.
{
    key: metric_fn(y_val, y_hat, weights=w_val)
    for key, metric_fn in (
        ('r2_w', weighted_r2),
        ('mae_w', weighted_mae),
        ('mse_w', weighted_mse),
        ('rmse_w', weighted_rmse),
    )
}

{'r2_w': 0.004648387432098389,
 'mae_w': 0.5207134,
 'mse_w': 0.63103724,
 'rmse_w': 0.7943785}

: 