# Limit order book FI-2010 dataset

In [1]:
import torch
import torch.nn as nn
import yaml
import os
import h5py
import importlib
import numpy as np

import src.lib.lob_loader as lob_loader 
import src.lib.lob_train_utils as lob_utils
import src.lib.experimentation as exp
# Re-execute the project modules imported above so that edits made to their
# source since the first import take effect without restarting the kernel.
importlib.reload(lob_loader)
importlib.reload(lob_utils)
importlib.reload(exp)

from src.lib.plotting import get_config
from src.models.basic_grunet import GRUNetLOB
from src.preprocessing.static_transformations import StandardScalerTimeSeries, IgnoreTimeDecorator

In [13]:
# Experiment configuration: dataset location plus zero-/one-argument factories
# that are handed to `exp.train_evaluate_lob_anchored` in the training cell.
cfg = get_config()

data_path = os.path.join(cfg['lob_dataset_directory'], 'lob.h5')

# Disabled walk-forward loader construction, kept for reference.
# NOTE(review): the last cell of this notebook still iterates `train_loader`,
# which appears to have come from this call (its recorded output has a 50-step
# window) — TODO confirm, then either restore this call together with an import
# of `get_wf_lob_loaders`, or remove that cell.
# train_loader, val_loader = get_wf_lob_loaders(
#     h5_path=data_path,
#     window=50,
#     split=2,
#     horizon=2,
#     batch_size=128,
#     class_resample=False,
#     normalization=None
# )


def model_init_fn():
    """Build a fresh GRUNetLOB with two GRU layers."""
    return GRUNetLOB(num_gru_layers=2)


def optimizer_init_fn(mod):
    """Build an RMSprop optimizer (lr=1e-4) over the parameters of `mod`."""
    return torch.optim.RMSprop(mod.parameters(), lr=1e-4)


def scheduler_init_fn(opt):
    """Build a MultiStepLR that multiplies the lr of `opt` by 0.1 at epochs 8 and 10."""
    # MultiStepLR documents that milestones must be increasing, hence [8, 10]
    # rather than the original [10, 8].
    return torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[8, 10], gamma=0.1)


def early_stopper_init_fn():
    """Build a fresh early stopper with patience=5."""
    return exp.EarlyStopper(patience=5)


def preprocess_init_fn():
    """Build the preprocessing transform: a StandardScalerTimeSeries wrapped in IgnoreTimeDecorator."""
    # NOTE(review): the meaning of the positional arguments 1 and 15 is defined
    # in src.preprocessing.static_transformations — verify against that module.
    return IgnoreTimeDecorator(StandardScalerTimeSeries(1), 15)

In [14]:
# to avoid errors with the data loaders creating too many file descriptors
torch.multiprocessing.set_sharing_strategy('file_system')

# Preferred GPU for this run. Requesting a CUDA index that does not exist
# raises at first use, so fall back to the current CUDA device, or to the CPU,
# when the preferred one is not present on this machine.
PREFERRED_GPU = 3
if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU:
    device = torch.device('cuda', PREFERRED_GPU)
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Train and evaluate on the requested split(s) using the factories from the
# configuration cell; the returned history is inspected in the next cell.
hist = exp.train_evaluate_lob_anchored(
    h5_file_path=data_path,
    model_init_fn=model_init_fn,
    preprocess_init_fn=preprocess_init_fn,
    optimizer_init_fn=optimizer_init_fn,
    scheduler_init_fn=scheduler_init_fn,
    early_stopper_init_fn=early_stopper_init_fn,
    num_epochs=20,
    device=device,
    random_state=42,
    horizon=2,
    windows=15,
    batch_size=128,
    use_resampling=True,
    splits=[6],
)

#### Evaluating model for split 6 ####
Fitting preprocesser to data for split 6


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

In [4]:
# Display the metric records of the first evaluated split.
# NOTE(review): presumably one dict per training epoch — confirm against
# exp.train_evaluate_lob_anchored.
split_results = hist['split_results']
split_results[0]

[{'accuracy': 0.4592388632117219,
  'precision': array([0.28013253, 0.84462685, 0.32297308]),
  'recall': array([0.68474939, 0.36699785, 0.61833208]),
  'f1': array([0.39760426, 0.51167039, 0.42431429]),
  'precision_avg': 0.4825774884465827,
  'recall_avg': 0.556693107548209,
  'f1_avg': 0.4445296461204309,
  'kappa': 0.2131102536364241,
  'val_loss': 0.9947122031226951,
  'train_loss': 0.00045617291470989585},
 {'accuracy': 0.5299685318123709,
  'precision': array([0.38933361, 0.83833644, 0.30774194]),
  'recall': array([0.57227995, 0.47465153, 0.71675432]),
  'f1': array([0.46340407, 0.60612539, 0.43060257]),
  'precision_avg': 0.511803995411332,
  'recall_avg': 0.5878952659639326,
  'f1_avg': 0.5000440110120695,
  'kappa': 0.26701175083969764,
  'val_loss': 0.9325814736207123,
  'train_loss': 0.00041951355524361134},
 {'accuracy': 0.5082849837742157,
  'precision': array([0.41023872, 0.81452618, 0.26666283]),
  'recall': array([0.45430929, 0.47548859, 0.69676935]),
  'f1': array([0

In [27]:
# Drain the training loader once and stack every batch along axis 0 so the
# whole training set is available as two NumPy arrays.
# NOTE(review): `train_loader` is not defined by any active cell in this
# notebook (its construction is commented out in the configuration cell), so
# this cell fails on a fresh kernel until that loader is recreated.
feature_batches, label_batches = [], []
for X, y in train_loader:
    feature_batches.append(X.numpy())
    label_batches.append(y.numpy())

X_train = np.concatenate(feature_batches, axis=0)
y_train = np.concatenate(label_batches, axis=0)
X_train.shape, y_train.shape

((124676, 50, 144), (124676, 1))