In [11]:
import pandas as pd
from scipy.stats import zscore
import pandas as pd
from sklearn.preprocessing import StandardScaler

# load packages
import pandas as pd
from typing import Optional
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33)
# N, D = X_train.shape
from datetime import date
import logging
from models.dataset import Dataset as Dataset

from models.utils import add_horizons, normalize

# Configure basic logging to a file
logging.basicConfig(
    filename='experiments.log',  # Name of the log file
    level=logging.INFO,             # Minimum logging level to capture (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s',  # Format of the log messages
    datefmt='%Y-%m-%d %H:%M:%S'      # Format for the timestamp
)

from models.gd import GradientDescent as GradientDescent

In [5]:
tickers=("slv","soxs","gdx")
model_shortname="universal"
#0-> 10, 1->50,2->100
horizon_to_predict=2
look_back_window=100
batch_size=64
alpha=0.01


logging.info(f"Running model with params Ticker: {tickers} Horizon Idx:{horizon_to_predict} window:{look_back_window} training batch size:{batch_size}")

In [6]:
data_path="data/etf/jan2025/cleaned/"
prefix="_cleaned_jan2025"

In [12]:
for ticker in tickers:
    dataset_train: Optional[Dataset] = None
    dataset_val: Optional[Dataset] = None
    dataset_test: Optional[Dataset] = None
    df = pd.read_csv(f"{data_path}{ticker}{prefix}.csv",engine="pyarrow",sep = ',')
    print(df.head())
    df["Date-Time"] = pd.to_datetime(df["Date-Time"])
    df["Date-Time"] = df["Date-Time"].dt.tz_convert("America/New_York")
    add_horizons(df,(10, 50, 100),alpha)
    class_summary = df.groupby("Target_100").size().reset_index(name="Count")
    # Calculate relative percentage
    class_summary["Percent"] = (class_summary["Count"] / class_summary["Count"].sum()) * 100
    class_summary["Percent"] = class_summary["Percent"].round(2)
    print(class_summary)
    normalize(df)
    df["Date"] = pd.to_datetime(df["Date-Time"]).dt.date
    df.groupby(["Date"]).size()
    df_train = df[(df["Date"] >= date(2025, 1, 3)) & (df["Date"] <= date(2025, 1, 10))]
    df_val = df[(df["Date"] >= date(2025, 1, 25)) & (df["Date"] <= date(2025, 1, 27))]
    df_test = df[(df["Date"] >= date(2025, 1, 28)) & (df["Date"] <= date(2025, 1, 31))]
    target_cols = [f"Target_{i}" for i in [10, 50, 100]]
    price_cols = [f"L{i}-BidPrice" for i in range(1, 11)] + [f"L{i}-AskPrice" for i in range(1, 11)]
    size_cols  = [f"L{i}-BidSize"  for i in range(1, 11)] + [f"L{i}-AskSize"  for i in range(1, 11)]
    df_train = df_train[price_cols + size_cols+target_cols]
    df_test =  df_test[price_cols + size_cols+target_cols]
    df_val = df_val[price_cols + size_cols+target_cols]
    if None in (dataset_train, dataset_val, dataset_test):
        dataset_train = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
        dataset_val = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
        dataset_test = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
    else:
        dataset_train1 = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
        dataset_val1 = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
        dataset_test1 = Dataset(data=df_train.to_numpy(), k=horizon_to_predict, num_classes=3, T=look_back_window)
        dataset_train.merge(dataset_train1)
        dataset_val.merge(dataset_val1)
        dataset_test.merge(dataset_test1)

  #RIC                           Date-Time  rel.spread  abs.spread  mid_price  \
0  SLV 2025-01-03 14:40:00.028091268+00:00    0.000735        0.02     27.210   
1  SLV 2025-01-03 14:40:00.375169630+00:00    0.000367        0.01     27.215   
2  SLV 2025-01-03 14:40:05.023122125+00:00    0.000735        0.02     27.220   
3  SLV 2025-01-03 14:40:05.447334991+00:00    0.000367        0.01     27.215   
4  SLV 2025-01-03 14:40:07.162635990+00:00    0.000735        0.02     27.220   

   L1-BidPrice  L1-BidSize  L1-AskPrice  L1-AskSize  L2-BidPrice  ...  \
0        27.20      9100.0        27.22      4100.0        27.19  ...   
1        27.21      2700.0        27.22     11200.0        27.20  ...   
2        27.21     20300.0        27.23     25500.0        27.20  ...   
3        27.21     18000.0        27.22      2600.0        27.20  ...   
4        27.21     23900.0        27.23     25500.0        27.20  ...   

   L8-AskPrice  L8-AskSize  L9-BidPrice  L9-BidSize  L9-AskPrice  L9-AskSi

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [14]:
train_loader = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)

print(dataset_train.x.shape, dataset_train.y.shape)

torch.Size([93860, 1, 100, 40]) torch.Size([93860])


In [16]:
from models.deeplob import deeplob as deeplob
model = deeplob(device=device,y_len = dataset_train.num_classes)
model.to(device)
model_savepoint=f"best_val_model_{model_shortname}_deeplob.pt"
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
from models.gd import GradientDescent as GradientDescent
train_losses, val_losses = GradientDescent(device).batch(model, criterion, optimizer,
                                    train_loader, val_loader,model_savepoint, epochs=50)
all_targets, all_predictions = GradientDescent(device).evaulate_model(model_savepoint, model, test_loader)
plt.figure(figsize=(15,6))
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='validation loss')
plt.legend()
print('accuracy_score:', accuracy_score(all_targets, all_predictions))
print(classification_report(all_targets, all_predictions, digits=4))

In [24]:
from models.cnn1 import CNN1

model = CNN1(num_classes = dataset_train.num_classes)
model.to(device)
model_savepoint=f"best_val_model_{model_shortname}_cnn1.pt"
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train_losses, val_losses = GradientDescent(device).batch(model, criterion, optimizer,
                                    train_loader, val_loader,model_savepoint, epochs=50)
all_targets, all_predictions = GradientDescent(device).evaulate_model(model_savepoint, model, test_loader)
plt.figure(figsize=(15,6))
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='validation loss')
plt.legend()
print('accuracy_score:', accuracy_score(all_targets, all_predictions))
print(classification_report(all_targets, all_predictions, digits=4))

  2%|▏         | 1/50 [00:05<04:33,  5.58s/it]

model saved
Epoch 1/50, Train Loss: 1.0768,               Validation Loss: 1.0638, Duration: 0:00:05.576221, Best Val Epoch: 0


  4%|▍         | 2/50 [00:11<04:28,  5.60s/it]

model saved
Epoch 2/50, Train Loss: 1.0395,               Validation Loss: 1.0104, Duration: 0:00:05.613930, Best Val Epoch: 1


  6%|▌         | 3/50 [00:16<04:22,  5.59s/it]

model saved
Epoch 3/50, Train Loss: 0.9854,               Validation Loss: 0.9560, Duration: 0:00:05.571208, Best Val Epoch: 2


  8%|▊         | 4/50 [00:22<04:17,  5.61s/it]

model saved
Epoch 4/50, Train Loss: 0.9330,               Validation Loss: 0.9070, Duration: 0:00:05.639514, Best Val Epoch: 3


 10%|█         | 5/50 [00:28<04:14,  5.65s/it]

model saved
Epoch 5/50, Train Loss: 0.8887,               Validation Loss: 0.8611, Duration: 0:00:05.719788, Best Val Epoch: 4


 12%|█▏        | 6/50 [00:33<04:05,  5.58s/it]

model saved
Epoch 6/50, Train Loss: 0.8507,               Validation Loss: 0.8211, Duration: 0:00:05.459533, Best Val Epoch: 5


 14%|█▍        | 7/50 [00:39<04:01,  5.61s/it]

model saved
Epoch 7/50, Train Loss: 0.8185,               Validation Loss: 0.8004, Duration: 0:00:05.652353, Best Val Epoch: 6


 16%|█▌        | 8/50 [00:45<03:58,  5.68s/it]

model saved
Epoch 8/50, Train Loss: 0.7927,               Validation Loss: 0.7689, Duration: 0:00:05.823491, Best Val Epoch: 7


 18%|█▊        | 9/50 [00:50<03:53,  5.69s/it]

model saved
Epoch 9/50, Train Loss: 0.7672,               Validation Loss: 0.7447, Duration: 0:00:05.733298, Best Val Epoch: 8


 20%|██        | 10/50 [00:56<03:48,  5.72s/it]

model saved
Epoch 10/50, Train Loss: 0.7440,               Validation Loss: 0.7228, Duration: 0:00:05.779544, Best Val Epoch: 9


 22%|██▏       | 11/50 [01:02<03:43,  5.73s/it]

model saved
Epoch 11/50, Train Loss: 0.7234,               Validation Loss: 0.7165, Duration: 0:00:05.736615, Best Val Epoch: 10


 24%|██▍       | 12/50 [01:06<03:23,  5.36s/it]

model saved
Epoch 12/50, Train Loss: 0.7054,               Validation Loss: 0.6886, Duration: 0:00:04.522283, Best Val Epoch: 11


 26%|██▌       | 13/50 [01:12<03:20,  5.41s/it]

model saved
Epoch 13/50, Train Loss: 0.6897,               Validation Loss: 0.6733, Duration: 0:00:05.517109, Best Val Epoch: 12


 28%|██▊       | 14/50 [01:17<03:13,  5.37s/it]

model saved
Epoch 14/50, Train Loss: 0.6724,               Validation Loss: 0.6605, Duration: 0:00:05.290097, Best Val Epoch: 13


 30%|███       | 15/50 [01:23<03:10,  5.44s/it]

Epoch 15/50, Train Loss: 0.6603,               Validation Loss: 0.6827, Duration: 0:00:05.590203, Best Val Epoch: 13


 32%|███▏      | 16/50 [01:28<03:07,  5.51s/it]

model saved
Epoch 16/50, Train Loss: 0.6445,               Validation Loss: 0.6338, Duration: 0:00:05.687294, Best Val Epoch: 15


 34%|███▍      | 17/50 [01:34<03:03,  5.57s/it]

model saved
Epoch 17/50, Train Loss: 0.6307,               Validation Loss: 0.6257, Duration: 0:00:05.707886, Best Val Epoch: 16


 36%|███▌      | 18/50 [01:40<02:56,  5.52s/it]

model saved
Epoch 18/50, Train Loss: 0.6220,               Validation Loss: 0.6088, Duration: 0:00:05.399578, Best Val Epoch: 17


 38%|███▊      | 19/50 [01:45<02:53,  5.59s/it]

model saved
Epoch 19/50, Train Loss: 0.6091,               Validation Loss: 0.5913, Duration: 0:00:05.737450, Best Val Epoch: 18


 40%|████      | 20/50 [01:51<02:48,  5.62s/it]

Epoch 20/50, Train Loss: 0.6023,               Validation Loss: 0.6631, Duration: 0:00:05.695923, Best Val Epoch: 18


 42%|████▏     | 21/50 [01:57<02:43,  5.64s/it]

Epoch 21/50, Train Loss: 0.5937,               Validation Loss: 0.6019, Duration: 0:00:05.701265, Best Val Epoch: 18


 44%|████▍     | 22/50 [02:02<02:37,  5.64s/it]

model saved
Epoch 22/50, Train Loss: 0.5852,               Validation Loss: 0.5867, Duration: 0:00:05.617222, Best Val Epoch: 21


 46%|████▌     | 23/50 [02:08<02:32,  5.67s/it]

Epoch 23/50, Train Loss: 0.5739,               Validation Loss: 0.6571, Duration: 0:00:05.736948, Best Val Epoch: 21


 48%|████▊     | 24/50 [02:13<02:24,  5.56s/it]

model saved
Epoch 24/50, Train Loss: 0.5703,               Validation Loss: 0.5737, Duration: 0:00:05.315325, Best Val Epoch: 23


 50%|█████     | 25/50 [02:19<02:19,  5.60s/it]

model saved
Epoch 25/50, Train Loss: 0.5642,               Validation Loss: 0.5490, Duration: 0:00:05.686898, Best Val Epoch: 24


 52%|█████▏    | 26/50 [02:25<02:14,  5.61s/it]

model saved
Epoch 26/50, Train Loss: 0.5556,               Validation Loss: 0.5349, Duration: 0:00:05.643040, Best Val Epoch: 25


 54%|█████▍    | 27/50 [02:30<02:09,  5.62s/it]

model saved
Epoch 27/50, Train Loss: 0.5504,               Validation Loss: 0.5348, Duration: 0:00:05.646234, Best Val Epoch: 26


 56%|█████▌    | 28/50 [02:36<02:04,  5.66s/it]

model saved
Epoch 28/50, Train Loss: 0.5473,               Validation Loss: 0.5244, Duration: 0:00:05.759955, Best Val Epoch: 27


 58%|█████▊    | 29/50 [02:42<01:58,  5.65s/it]

Epoch 29/50, Train Loss: 0.5382,               Validation Loss: 0.5718, Duration: 0:00:05.625006, Best Val Epoch: 27


 60%|██████    | 30/50 [02:47<01:52,  5.63s/it]

model saved
Epoch 30/50, Train Loss: 0.5345,               Validation Loss: 0.5170, Duration: 0:00:05.575068, Best Val Epoch: 29


 62%|██████▏   | 31/50 [02:52<01:44,  5.49s/it]

model saved
Epoch 31/50, Train Loss: 0.5286,               Validation Loss: 0.5145, Duration: 0:00:05.147985, Best Val Epoch: 30


 64%|██████▍   | 32/50 [02:58<01:39,  5.54s/it]

model saved
Epoch 32/50, Train Loss: 0.5260,               Validation Loss: 0.5085, Duration: 0:00:05.671766, Best Val Epoch: 31


 66%|██████▌   | 33/50 [03:04<01:34,  5.57s/it]

Epoch 33/50, Train Loss: 0.5186,               Validation Loss: 0.5272, Duration: 0:00:05.631705, Best Val Epoch: 31


 68%|██████▊   | 34/50 [03:09<01:29,  5.60s/it]

Epoch 34/50, Train Loss: 0.5160,               Validation Loss: 0.5136, Duration: 0:00:05.680129, Best Val Epoch: 31


 70%|███████   | 35/50 [03:15<01:25,  5.70s/it]

Epoch 35/50, Train Loss: 0.5112,               Validation Loss: 0.5992, Duration: 0:00:05.919278, Best Val Epoch: 31


 72%|███████▏  | 36/50 [03:20<01:14,  5.30s/it]

model saved
Epoch 36/50, Train Loss: 0.5068,               Validation Loss: 0.4857, Duration: 0:00:04.385822, Best Val Epoch: 35


 74%|███████▍  | 37/50 [03:25<01:09,  5.37s/it]

model saved
Epoch 37/50, Train Loss: 0.5036,               Validation Loss: 0.4827, Duration: 0:00:05.530300, Best Val Epoch: 36


 76%|███████▌  | 38/50 [03:31<01:05,  5.49s/it]

model saved
Epoch 38/50, Train Loss: 0.4994,               Validation Loss: 0.4814, Duration: 0:00:05.768302, Best Val Epoch: 37


 78%|███████▊  | 39/50 [03:37<01:01,  5.56s/it]

Epoch 39/50, Train Loss: 0.4964,               Validation Loss: 0.4863, Duration: 0:00:05.717952, Best Val Epoch: 37


 80%|████████  | 40/50 [03:43<00:56,  5.62s/it]

model saved
Epoch 40/50, Train Loss: 0.4909,               Validation Loss: 0.4810, Duration: 0:00:05.774018, Best Val Epoch: 39


 82%|████████▏ | 41/50 [03:48<00:50,  5.65s/it]

Epoch 41/50, Train Loss: 0.4894,               Validation Loss: 0.5148, Duration: 0:00:05.715755, Best Val Epoch: 39


 84%|████████▍ | 42/50 [03:53<00:42,  5.32s/it]

model saved
Epoch 42/50, Train Loss: 0.4844,               Validation Loss: 0.4764, Duration: 0:00:04.560185, Best Val Epoch: 41


 86%|████████▌ | 43/50 [03:58<00:37,  5.42s/it]

Epoch 43/50, Train Loss: 0.4830,               Validation Loss: 0.4799, Duration: 0:00:05.632245, Best Val Epoch: 41


 88%|████████▊ | 44/50 [04:04<00:33,  5.51s/it]

model saved
Epoch 44/50, Train Loss: 0.4796,               Validation Loss: 0.4612, Duration: 0:00:05.732331, Best Val Epoch: 43


 90%|█████████ | 45/50 [04:10<00:27,  5.58s/it]

Epoch 45/50, Train Loss: 0.4752,               Validation Loss: 0.4733, Duration: 0:00:05.750441, Best Val Epoch: 43


 92%|█████████▏| 46/50 [04:15<00:21,  5.50s/it]

model saved
Epoch 46/50, Train Loss: 0.4713,               Validation Loss: 0.4529, Duration: 0:00:05.300089, Best Val Epoch: 45


 94%|█████████▍| 47/50 [04:21<00:16,  5.57s/it]

Epoch 47/50, Train Loss: 0.4671,               Validation Loss: 0.4637, Duration: 0:00:05.726897, Best Val Epoch: 45


 96%|█████████▌| 48/50 [04:26<00:11,  5.56s/it]

model saved
Epoch 48/50, Train Loss: 0.4658,               Validation Loss: 0.4524, Duration: 0:00:05.527299, Best Val Epoch: 47


 98%|█████████▊| 49/50 [04:32<00:05,  5.59s/it]

Epoch 49/50, Train Loss: 0.4640,               Validation Loss: 0.4804, Duration: 0:00:05.682729, Best Val Epoch: 47


100%|██████████| 50/50 [04:38<00:00,  5.57s/it]

model saved
Epoch 50/50, Train Loss: 0.4596,               Validation Loss: 0.4479, Duration: 0:00:05.719579, Best Val Epoch: 49





In [None]:
from models.mlp import MLP

model = MLP()
model.to(device)
model_savepoint=f"best_val_model_{model_shortname}_mlp.pt"
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train_losses, val_losses = GradientDescent(device).batch(model, criterion, optimizer,
                                    train_loader, val_loader,model_savepoint, epochs=50)
all_targets, all_predictions = GradientDescent(device).evaulate_model(model_savepoint, model, test_loader)
plt.figure(figsize=(15,6))
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='validation loss')
plt.legend()
print('accuracy_score:', accuracy_score(all_targets, all_predictions))
print(classification_report(all_targets, all_predictions, digits=4))