In [1]:
import sys, importlib
import torch 
import torch.nn as nn
import numpy as np
import pandas as pd
import copy
import time
from collections import OrderedDict

# Put the parent directory on the import path so the local `proj_mod` package can be imported.
sys.path.append("../")
from proj_mod import training, data_processing, visualization
# Re-import the project modules so on-disk edits are picked up without restarting the kernel.
# The trailing semicolons suppress the module repr that would otherwise be echoed as cell output.
importlib.reload(training);
importlib.reload(data_processing);
importlib.reload(visualization);

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

Using device cuda


# Functions to optimize hyperparameters

In [None]:
import optuna
import torch.optim as optim

def train_one_epoch(model, dataloader, optimizer, device, eps=0):
    """Run one optimisation pass over ``dataloader`` and return the epoch RMSPE.

    The model is put in train mode, and for every batch the loss from
    ``training.RMSPELoss(eps=eps)`` is back-propagated and one optimizer step
    is taken. The returned metric is computed from the predictions made
    *during* that pass (i.e. before each batch's weight update, with the model
    still in train mode).

    Args:
        model: Module called as ``model(x)`` on each batch input.
        dataloader: Iterable yielding ``(x, y)`` batches.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        device: Device the batch tensors are moved to.
        eps: Value added to the target in the denominator of the percentage error.

    Returns:
        float: ``sqrt(sum(((pred - y) / (y + eps)) ** 2) / n)`` where ``n`` is the
        number of samples actually processed.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.train()
    loss_fn = training.RMSPELoss(eps=eps)
    sum_of_squares = 0.0
    # Count the samples that are really seen instead of trusting len(dataloader.dataset):
    # with drop_last=True or a subsetting sampler the two differ and the metric would be biased low.
    samples_seen = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        # Metric bookkeeping only; keep it out of the autograd graph.
        with torch.no_grad():
            sum_of_squares += torch.sum(torch.square((pred - y) / (y + eps)))
        samples_seen += y.shape[0]
    if samples_seen == 0:
        raise ValueError("train_one_epoch: dataloader yielded no samples")
    with torch.no_grad():
        rmspe = torch.sqrt(sum_of_squares / samples_seen)
    return rmspe.item()

def validate(model, dataloader, device, eps=0):
    """Evaluate ``model`` on ``dataloader`` and return the RMSPE.

    The model is switched to eval mode and all computation runs under
    ``torch.no_grad()``.

    Args:
        model: Module called as ``model(x)`` on each batch input.
        dataloader: Iterable yielding ``(x, y)`` batches.
        device: Device the batch tensors are moved to.
        eps: Value added to the target in the denominator of the percentage error.

    Returns:
        float: ``sqrt(sum(((pred - y) / (y + eps)) ** 2) / n)`` where ``n`` is the
        number of samples actually processed.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    sum_of_squares = 0.0
    # Count the samples that are really seen instead of trusting len(dataloader.dataset):
    # with drop_last=True or a subsetting sampler the two differ and the metric would be biased low.
    samples_seen = 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            sum_of_squares += torch.sum(torch.square((pred - y) / (y + eps)))
            samples_seen += y.shape[0]
        if samples_seen == 0:
            raise ValueError("validate: dataloader yielded no samples")
        rmspe = torch.sqrt(sum_of_squares / samples_seen)
    return rmspe.item()

def objective(trial, define_model, *, train_dl=None, val_dl=None,
              max_epochs=10, patience=3, min_delta=1e-4, eps=1e-5):
    """Optuna objective: train one candidate model and return its best validation RMSPE.

    A learning rate is sampled from the trial, the model is built with
    ``define_model(trial)`` and trained with AdamW for at most ``max_epochs``
    epochs. After every epoch the validation RMSPE is reported to the trial so
    the study's pruner can stop it, and training also stops early after
    ``patience`` consecutive epochs without an improvement of at least
    ``min_delta``.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        define_model: Callable ``define_model(trial)`` returning the model to train.
        train_dl: Training dataloader. Defaults to the notebook-level ``train_loader``.
        val_dl: Validation dataloader. Defaults to the notebook-level ``test_loader``.
        max_epochs: Upper bound on the number of training epochs.
        patience: Number of consecutive non-improving epochs tolerated before stopping.
        min_delta: Minimum decrease of the validation RMSPE that counts as an improvement.
        eps: Stabiliser passed to ``train_one_epoch`` and ``validate``.

    Returns:
        float: The lowest validation RMSPE that counted as an improvement
        (``inf`` if no epoch ever improved).

    Raises:
        optuna.exceptions.TrialPruned: When the trial reports it should be pruned.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    train_dl = train_loader if train_dl is None else train_dl
    val_dl = test_loader if val_dl is None else val_dl

    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)

    model = define_model(trial).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # NOTE(review): the scheduler patience (5) is larger than the default early-stopping
    # patience (3), so the LR reduction will normally not fire before the trial stops —
    # confirm this is intended.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer, mode="min", factor=0.5, patience=5, min_lr=1e-7
    )

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(max_epochs):
        # The training-set metric is not used for model selection, so it is not kept.
        train_one_epoch(model, train_dl, optimizer, device, eps)
        val_loss = validate(model, val_dl, device, eps)

        scheduler.step(val_loss)

        # Intermediate value for the pruner.
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if val_loss < best_val_loss - min_delta:
            best_val_loss = val_loss
            patience_counter = 0          # reset counter on improvement
        else:
            patience_counter += 1         # increment if no improvement
            if patience_counter >= patience:
                break                     # early stopping

    return best_val_loss

# Usage examples

## Needed by both models

### Data and splitting

In [3]:
# Time-id ordering and the sub-interval RV time series, both precomputed on disk.
list_time = np.load("../processed_data/recovered_time_id_order.npy")
df_RV_ts = pd.read_parquet("../processed_data/book_RV_ts_60_si.parquet")

# Targets, with a "<stock_id>-<time_id>" key added as `row_id`.
df_target = pd.read_csv("../raw_data/kaggle_ORVP/train.csv")
_stock_part = df_target["stock_id"].astype(int).astype(str)
_time_part = df_target["time_id"].astype(int).astype(str)
df_target["row_id"] = _stock_part + "-" + _time_part

# Single split; the first (and only) fold provides the train and test time ids.
time_split_list = data_processing.time_cross_val_split(
    list_time=list_time,
    n_split=1,
    percent_val_size=10,
    list_output=True,
)
_fold0 = time_split_list[0]
train_time_id, test_time_id = _fold0[0], _fold0[1]

In fold 0 :

Train set end at 8117 .

Test set start at 15516 end at 10890 .



### Modules

In [4]:
# Sizes shared by the sub-modules below.
ts_emb_dim = 32
n_diff = 2
ts_dim = n_diff + 1

# Positional embedder, placed on the selected device.
pos_embedder = training.pos_emb_cross_attn(
    length=60,
    ts_dim=ts_dim,
    emb_dim=ts_emb_dim,
    dropout=0.2,
    num_heads=4,
    keep_mag=True,
).to(device=device)

# Feed-forward stack handed to the encoder: emb -> 64 -> emb.
ts_encoder_ff_layer = [
    nn.Linear(ts_emb_dim, 64),
    nn.ReLU(),
    nn.Linear(64, ts_emb_dim),
]

# Feed-forward stack handed to the decoder: same layout as the encoder one.
ts_decoder_ff_layer = [
    nn.Linear(ts_emb_dim, 64),
    nn.ReLU(),
    nn.Linear(64, ts_emb_dim),
]

# Output head mapping an embedding to a single value.
output_ff = nn.Sequential(nn.Linear(ts_emb_dim, 1)).to(device=device)

## Encoder-decoder model without id

### Dataset creation

In [None]:
# Datasets for the encoder-decoder variant that does not use the stock id.
# NOTE(review): tab_features=["emb_id"] is requested although no df_tab_feat is passed and this
# section is the "without id" variant — confirm that RVdataset handles this as intended.
train_dataset=training.RVdataset(time_id_list=train_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_target=df_target)
test_dataset=training.RVdataset(time_id_list=test_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_target=df_target)

train_loader=torch.utils.data.DataLoader(dataset=train_dataset,batch_size=512,shuffle=True, num_workers=4, pin_memory=True)
# Evaluation only sums squared errors over the whole set, so shuffling adds nothing;
# a fixed order keeps the evaluation pass deterministic.
test_loader=torch.utils.data.DataLoader(dataset=test_dataset,batch_size=512,shuffle=False, num_workers=4, pin_memory=True)

### Model definition and optimization of hyperparameters

In [None]:
def define_model_encdec(trial):
    """Build a fresh encoder-decoder model for one Optuna trial.

    Samples the number of encoder layers, the number of decoder layers and a
    dropout rate (shared by encoder and decoder) from ``trial``.

    The positional embedder, feed-forward lists and output head are defined
    once at notebook level. They are deep-copied here so that each trial
    trains its own parameters; passing the shared instances directly would let
    weights learned in one trial carry over into the next (unless
    ``training.encoder_decoder_teacherforcing`` already copies them
    internally — TODO confirm), which would make trial scores incomparable.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The model returned by ``training.encoder_decoder_teacherforcing``.
    """
    encoder_layers = trial.suggest_int("encoder_layer_num", 2, 6)
    decoder_layers = trial.suggest_int("decoder_layer_num", 2, 6)
    dropout = trial.suggest_float("dropout", 0.0, 0.5, step=0.1)

    return training.encoder_decoder_teacherforcing(
        # Per-trial copies: the notebook-level modules stay untouched.
        pos_emb_model=copy.deepcopy(pos_embedder),
        output_feedforward=copy.deepcopy(output_ff),
        encoder_dropout=dropout,
        decoder_dropout=dropout,
        encoder_feedforward_list=copy.deepcopy(ts_encoder_ff_layer),
        decoder_feedforward_list=copy.deepcopy(ts_decoder_ff_layer),
        n_diff=n_diff,
        encoder_layer_num=encoder_layers,
        decoder_layer_num=decoder_layers,
        input_scaler=10000,
        ts_emb_dim=ts_emb_dim,
        encoder_num_heads=4,
        decoder_num_heads=4,
        encoder_keep_mag=True,
        decoder_keep_mag=True,
        return_sum=True
    )

def _encdec_objective(trial):
    """Adapter that lets Optuna call `objective` with the id-free model factory."""
    return objective(trial, define_model_encdec)


# Minimise the validation metric returned by the objective over 20 trials.
study = optuna.create_study(direction="minimize")
study.optimize(_encdec_objective, n_trials=20)

# Summarise the winning trial: its score first, then every sampled hyperparameter.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

The run was interrupted manually during trial 5 (see the `KeyboardInterrupt` in the last log line); the log below shows the trials that completed before that:

[I 2025-08-05 10:54:39,349] A new study created in memory with name: no-name-2029d593-a46b-4a1e-8988-a32d747af686
[I 2025-08-05 10:57:38,321] Trial 0 finished with value: 0.23577575385570526 and parameters: {'lr': 0.00868445237867199, 'encoder_layer_num': 3, 'decoder_layer_num': 4, 'dropout': 0.1}. Best is trial 0 with value: 0.23577575385570526.
[I 2025-08-05 11:01:05,833] Trial 1 finished with value: 0.22967329621315002 and parameters: {'lr': 0.0006277516484037366, 'encoder_layer_num': 2, 'decoder_layer_num': 6, 'dropout': 0.0}. Best is trial 1 with value: 0.22967329621315002.
[I 2025-08-05 11:04:44,128] Trial 2 finished with value: 0.22987382113933563 and parameters: {'lr': 0.0009455489769112315, 'encoder_layer_num': 2, 'decoder_layer_num': 6, 'dropout': 0.30000000000000004}. Best is trial 1 with value: 0.22967329621315002.
[I 2025-08-05 11:11:30,216] Trial 3 finished with value: 0.22987306118011475 and parameters: {'lr': 0.00015086803239011521, 'encoder_layer_num': 6, 'decoder_layer_num': 5, 'dropout': 0.0}. Best is trial 1 with value: 0.22967329621315002.
[I 2025-08-05 11:15:14,044] Trial 4 finished with value: 0.22976943850517273 and parameters: {'lr': 0.00010708282792185345, 'encoder_layer_num': 4, 'decoder_layer_num': 3, 'dropout': 0.1}. Best is trial 1 with value: 0.22967329621315002.
[W 2025-08-05 11:15:53,732] Trial 5 failed with parameters: {'lr': 9.694640858927792e-05, 'encoder_layer_num': 3, 'decoder_layer_num': 3, 'dropout': 0.0} because of the following error: KeyboardInterrupt().

## Encoder-decoder model with id

### Data and dataset creation

In [5]:
RV_tab=pd.read_csv("../processed_data/RV_by_row_id.csv")
# row_id is "<stock_id>-<time_id>"; split it with the vectorised string accessor.
# Both parts are kept as strings, exactly as the split produces them.
_row_id_parts=RV_tab["row_id"].str.split("-")
RV_tab["stock_id"]=_row_id_parts.str[0]
RV_tab["time_id"]=_row_id_parts.str[1]

# Creates tabular data, most specifically 'emb_id':
# a dense integer index 0..n-1 per distinct stock id (ids are sorted as strings).
unique_ids = sorted(RV_tab['stock_id'].unique())
id_to_emb = {stock_id: i for i, stock_id in enumerate(unique_ids)}
RV_tab['emb_id'] = RV_tab['stock_id'].map(id_to_emb)

train_dataset=training.RVdataset(time_id_list=train_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_tab_feat=RV_tab,df_target=df_target)
test_dataset=training.RVdataset(time_id_list=test_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_tab_feat=RV_tab,df_target=df_target)

# Where the time-series feature and the id feature are located, as reported by the dataset.
ts_place, id_place=train_dataset.featureplace["sub_int_RV"], train_dataset.featureplace["emb_id"]

train_loader=torch.utils.data.DataLoader(dataset=train_dataset,batch_size=512,shuffle=True, num_workers=4, pin_memory=True)
# Evaluation only sums squared errors over the whole set, so shuffling adds nothing;
# a fixed order keeps the evaluation pass deterministic.
test_loader=torch.utils.data.DataLoader(dataset=test_dataset,batch_size=512,shuffle=False, num_workers=4, pin_memory=True)

### Model definition and optimization of hyperparameters

In [None]:

def define_model_encdec_id(trial):
    """Build a fresh id-aware encoder-decoder model for one Optuna trial.

    Samples the number of encoder layers, the number of decoder layers and a
    dropout rate (shared by encoder and decoder) from ``trial``, builds the
    base encoder-decoder, and wraps it in
    ``training.id_learned_embedding_adj_rnn_mtpl`` together with a newly
    created id network (8 -> 32 -> 16 -> 8 -> 1 with Tanh activations).

    The positional embedder, feed-forward lists and output head are defined
    once at notebook level. They are deep-copied here so that each trial
    trains its own parameters; passing the shared instances directly would let
    weights learned in one trial carry over into the next (unless
    ``training.encoder_decoder_teacherforcing`` already copies them
    internally — TODO confirm), which would make trial scores incomparable.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The model returned by ``training.id_learned_embedding_adj_rnn_mtpl``.
    """
    encoder_layers = trial.suggest_int("encoder_layer_num", 2, 6)
    decoder_layers = trial.suggest_int("decoder_layer_num", 2, 6)
    dropout = trial.suggest_float("dropout", 0.0, 0.5, step=0.1)

    base_model = training.encoder_decoder_teacherforcing(
        # Per-trial copies: the notebook-level modules stay untouched.
        pos_emb_model=copy.deepcopy(pos_embedder),
        output_feedforward=copy.deepcopy(output_ff),
        encoder_dropout=dropout,
        decoder_dropout=dropout,
        encoder_feedforward_list=copy.deepcopy(ts_encoder_ff_layer),
        decoder_feedforward_list=copy.deepcopy(ts_decoder_ff_layer),
        n_diff=n_diff,
        encoder_layer_num=encoder_layers,
        decoder_layer_num=decoder_layers,
        input_scaler=10000,
        ts_emb_dim=ts_emb_dim,
        encoder_num_heads=4,
        decoder_num_heads=4,
        encoder_keep_mag=True,
        decoder_keep_mag=True,
        return_sum=True
    )

    # The id network is created inside the function, so it is already fresh for every trial.
    id_emb_dim = 8
    id_hidden_dict = OrderedDict([("linear1", nn.Linear(in_features=id_emb_dim, out_features=32),),
                                  ("tanh1", nn.Tanh()),
                                  ("linear2", nn.Linear(in_features=32, out_features=16)),
                                  ("tanh2", nn.Tanh()),
                                  ("linear3", nn.Linear(in_features=16, out_features=8)),
                                  ("tanh3", nn.Tanh()),
                                  ("linear4", nn.Linear(in_features=8, out_features=1))])
    id_hidden_layers = nn.Sequential(id_hidden_dict).to(device=device)

    # NOTE(review): id_input_num=112 is hard-coded; presumably it must cover every distinct
    # emb_id value — verify against the number of stock ids in the tabular data.
    return training.id_learned_embedding_adj_rnn_mtpl(ts_place=ts_place,
                                             id_place=id_place,
                                             rnn_model=base_model,
                                             id_hidden_model=id_hidden_layers,
                                             id_input_num=112,
                                             emb_dim=id_emb_dim)

# Hyperparameter search for the id-aware encoder-decoder (validation metric is minimised).
study2 = optuna.create_study(direction="minimize")
study2.optimize(lambda trial: objective(trial, define_model_encdec_id), n_trials=20)

# Report the winning trial in the same format as the id-free study so the two can be compared.
best_trial_with_id = study2.best_trial
print("Best trial:")
print(f"  Value: {best_trial_with_id.value}")
for key, value in best_trial_with_id.params.items():
    print(f"    {key}: {value}")
    