In [1]:
import polars as pl
import numpy as np
import os
import gc
from prj.config import EXP_DIR


USE_GPU = True
if not USE_GPU:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader

data_args = {}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

2024-12-29 17:07:29.540206: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-29 17:07:29.540240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-29 17:07:29.541575: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-29 17:07:29.548474: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
start_dt, end_dt = 1360, 1698
start_val_date = 1530

start_dt, end_dt = 1360, 1400
start_val_date = 1390

complete_ds = loader.load(start_dt, end_dt)
features = loader.features

In [4]:
train_ds = complete_ds.filter(pl.col('date_id').lt(start_val_date))
val_ds = complete_ds.filter(pl.col('date_id').ge(start_val_date))

es_ratio = 0.15
dates_train = train_ds.select('date_id').collect().to_series().unique().sort().to_numpy()
es_dates = dates_train[int(len(dates_train)*(1-es_ratio)):]

es_ds = train_ds.filter(pl.col('date_id').is_in(es_dates))
train_ds = train_ds.filter(~pl.col('date_id').is_in(es_dates))

means = train_ds.select('symbol_id', *features).group_by('symbol_id').agg(pl.all().mean().name.suffix('_global_mean')).collect()
stds = train_ds.select('symbol_id', *features).group_by('symbol_id').agg(pl.all().std(ddof=0).name.suffix('_global_std')).collect()
eps = 1e-8

train_ds = train_ds.join(means.lazy(), on='symbol_id', how='left', maintain_order='left').join(stds.lazy(), on='symbol_id', how='left', maintain_order='left').with_columns(
    (pl.col(f).sub(pl.col(f'{f}_global_mean'))).truediv(pl.col(f'{f}_global_std').add(eps)) for f in loader.features
).drop([f'{f}_global_mean' for f in loader.features] + [f'{f}_global_std' for f in loader.features]).fill_nan(None).fill_null(strategy='zero')

es_ds = es_ds.join(means.lazy(), on='symbol_id', how='left', maintain_order='left').join(stds.lazy(), on='symbol_id', how='left', maintain_order='left').with_columns(
    (pl.col(f).sub(pl.col(f'{f}_global_mean'))).truediv(pl.col(f'{f}_global_std').add(eps)) for f in loader.features
).drop([f'{f}_global_mean' for f in loader.features] + [f'{f}_global_std' for f in loader.features]).fill_nan(None).fill_null(strategy='zero')

val_ds = val_ds.join(means.lazy(), on='symbol_id', how='left', maintain_order='left').join(stds.lazy(), on='symbol_id', how='left', maintain_order='left').with_columns(
    (pl.col(f).sub(pl.col(f'{f}_global_mean'))).truediv(pl.col(f'{f}_global_std').add(eps)) for f in loader.features
).drop([f'{f}_global_mean' for f in loader.features] + [f'{f}_global_std' for f in loader.features]).fill_nan(None).fill_null(strategy='zero')



X_train, y_train, w_train, _ = loader._build_splits(train_ds)
X_es, y_es, w_es, _ = loader._build_splits(es_ds)
X_val, y_val, w_val, _ = loader._build_splits(val_ds)

X_train.shape, X_es.shape, X_val.shape

((923472, 79), (168432, 79), (366872, 79))

In [5]:
import time
from keras import optimizers as tfko
from keras import metrics as tfkm
from keras import callbacks as tfkc
from prj.model.keras.mlp import Mlp, SimpleNNModel


# model = Mlp(
#     input_dim=(len(loader.features),),
#     hidden_units=[512, 256],
#     use_gaussian_noise=False,
#     use_batch_norm=False,
#     use_dropout=True,
#     dropout_rate=0.1,
# )

model = SimpleNNModel(
    input_dim=(len(loader.features),),
    hidden_units=[512, 256],
    use_gaussian_noise=False,
    use_batch_norm=False,
    use_dropout=True,
    dropout_rate=0.1,
    use_tanh=True,
    final_mult=5.0
)


optimizer = tfko.Adam(learning_rate=1e-4)
loss = 'mse'
metrics = [tfkm.R2Score(), tfkm.MeanSquaredError()]
batch_size = 1024
lr_scheduler = tfkc.ReduceLROnPlateau(
    monitor='val_loss',
    patience=5,
    verbose=1
)

WEIGHTED_LOSS = False

model.fit(
    X_train, y_train,
    sample_weight=w_train if WEIGHTED_LOSS else None,
    validation_data=(X_es, y_es, w_es) if WEIGHTED_LOSS else (X_es, y_es),
    batch_size=batch_size,
    epochs=100,
    loss=loss,
    optimizer=optimizer,
    metrics=metrics,
    lr_scheduler=lr_scheduler,
    early_stopping_rounds=5,
)
save_dir = EXP_DIR / 'model' / f'mlp_{time.time()}'
model.save(save_dir)

2024-12-29 17:07:36.725139: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-29 17:07:36.726276: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-29 17:07:36.726460: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Training with early stopping patience 5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 79)]              0         
                                                                 
 dense_0 (Dense)             (None, 512)               40960     
                                                                 
 activation (Activation)     (None, 512)               0         
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 activation_1 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout) 

2024-12-29 17:07:39.004676: I external/local_xla/xla/service/service.cc:168] XLA service 0x721c9968f4c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-29 17:07:39.004703: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 960, Compute Capability 5.2
2024-12-29 17:07:39.010713: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-29 17:07:39.025558: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1735488459.077568  395850 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 41: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Fit complete after 41


In [6]:
from sklearn.metrics import r2_score
import tensorflow as tf

def evaluate_model(model, X_val, y_val, w_val):
    with tf.device('/CPU:0'):
        y_hat_val = model.predict(X_val, batch_size=1024*4)
    print(r2_score(y_true=y_val, y_pred=y_hat_val, sample_weight=w_val))

In [7]:
evaluate_model(model, X_train, y_train, w_train)
evaluate_model(model, X_es, y_es, w_es)
evaluate_model(model, X_val, y_val, w_val)


0.10680359601974487
-0.030304551124572754
-0.15921282768249512


In [8]:
import tensorflow as tf
X_val, y_val, w_val, _ = loader._build_splits(val_ds)
with tf.device('/CPU:0'):
    y_hat = model.predict(X_val, batch_size=batch_size)
y_val.shape, y_hat.shape




((366872,), (366872,))

In [9]:
from prj.metrics import weighted_mae, weighted_mse, weighted_r2, weighted_rmse

{
    'r2_w': weighted_r2(y_val, y_hat, weights=w_val),
    'mae_w': weighted_mae(y_val, y_hat, weights=w_val),
    'mse_w': weighted_mse(y_val, y_hat, weights=w_val),
    'rmse_w': weighted_rmse(y_val, y_hat, weights=w_val),
}

{'r2_w': -0.15912675857543945,
 'mae_w': 0.7232343,
 'mse_w': 1.1457345,
 'rmse_w': 1.0703899}