In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs
import torch
from lightning.fabric import Fabric
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm, trange

from src.datamodule import DataloaderConfig, DataModule, collate_fn
from src.dataset import LOBDataset, OrderFlowDataset
from src.models.deeplob import DeepLOBConfig
from src.models.deepvol import DeepVolConfig

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [3]:
ds = OrderFlowDataset("data/orderflow/train_memmap")

In [4]:
dl_config = DataloaderConfig(batch_size=32, eval_batch_size=64, shuffle=True)
dm = DataModule(
    dl_config, 
    window_size=100, 
    num_levels=10, 
    train_data_path="data/orderflow/train_memmap",
    val_data_path="data/orderflow/val_memmap",
    test_data_path="data/orderflow/test_memmap",
    data_repr="orderflow",
)
dm.setup()

[[36m2024-12-20 18:29:16,493[0m][[34mdata[0m][[32mINFO[0m] - Train dataset loaded: len(self.train_ds)=2799999[0m
[[36m2024-12-20 18:29:16,496[0m][[34mdata[0m][[32mINFO[0m] - Validation dataset loaded: len(self.val_ds)=250000[0m
[[36m2024-12-20 18:29:16,498[0m][[34mdata[0m][[32mINFO[0m] - Test dataset loaded: len(self.test_ds)=297666[0m


In [5]:
b = next(iter(dm.test_dataloader()))
b["X"].shape, b["y"].shape, b["idx"].shape

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


(torch.Size([64, 100, 20]), torch.Size([64, 1]), torch.Size([64]))

In [6]:
config = DeepVolConfig(num_targets=dm.num_targets, num_levels=dm.num_levels)
model = config.get_model()

In [7]:
x = b["X"]
print(x.shape)
x = x.unsqueeze(1)
print(x.shape)
x = model.conv2(x)
print(x.shape)
x = model.conv3(x)
print(x.shape)

torch.Size([64, 100, 20])
torch.Size([64, 1, 100, 20])
torch.Size([64, 32, 100, 10])
torch.Size([64, 32, 100, 1])


  return F.conv2d(


In [8]:
x_inp1 = model.inp1(x)
print(x_inp1.shape)
x_inp2 = model.inp2(x)
print(x_inp2.shape)
x_inp3 = model.inp3(x)
print(x_inp3.shape)

x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
print(x.shape)

x = x.squeeze(-1).permute(0, 2, 1)
print(x.shape)

torch.Size([64, 64, 100, 1])
torch.Size([64, 64, 100, 1])
torch.Size([64, 64, 100, 1])
torch.Size([64, 192, 100, 1])
torch.Size([64, 100, 192])


In [9]:
x, _ = model.lstm(x)
print(x.shape)

x = x[:, -1, :]
print(x.shape)

torch.Size([64, 100, 64])
torch.Size([64, 64])


In [10]:
x = model.fc(x)
print(x.shape)

torch.Size([64, 7])


In [16]:
x[:, [0]].shape

torch.Size([64, 1])

In [15]:
torch.nn.functional.mse_loss(x[:, [0]], b["y"])

tensor(0.1183, grad_fn=<MseLossBackward0>)

In [28]:
a, b = torch.tensor([[1, 2, 3], [1, 2, 3]]).float(), torch.tensor([[1, 2, 4], [1, 2, 5]]).float()

In [None]:
err = (a - b)**2
err

In [None]:
torch.nn.functional.mse_loss(a, b)

In [None]:
err.mean()

In [None]:
(err / 2)

In [None]:
x.shape

In [76]:
# Column layout of the final frame: target first, then the two derived
# features, then for every book level 0..14 the ask/bid rate and size.
cols = ["y", "midprice", "spread"] + [
    f"{side}{field}{level}"
    for level in range(15)
    for side in ("ask", "bid")
    for field in ("Rate", "Size")
]

# # Assuming that order reflects time
# df = df.with_row_index(name="time").with_columns(pl.col("time").cast(pl.Int64))

df = (
    # Read L2 data
    pl.read_parquet("data/data.parquet")
    # Rescale prices by 1000 (so that they are around 1) and log-transform sizes
    .with_columns(cs.contains("Rate") / 1000, cs.contains("Size").log())
    # Fill nulls with 0 (applied after the rescale/log step, so it acts on transformed values)
    .with_columns((cs.contains("Rate") | cs.contains("Size")).fill_null(0))
    # Midprice and spread from the best (level 0) quotes
    .with_columns(
        midprice=(pl.col("askRate0") + pl.col("bidRate0")) / 2,
        spread=pl.col("askRate0") - pl.col("bidRate0"),
    )
    # Reorder cols
    .select(cols)
)

In [None]:
(len(df) - 3_200_000) / len(df)

In [None]:
150_000 / len(df)

In [None]:
int(len(df) * 0.1)

In [None]:
import math


num_cycles = 10
num_training_steps = 50000 * 2
num_warmup_steps = 2000

def lr_lambda(current_step: int) -> float:
    """Learning-rate multiplier: linear warmup followed by a cosine wave.

    Reads the module-level ``num_warmup_steps``, ``num_training_steps`` and
    ``num_cycles``.

    Args:
        current_step: Index of the optimisation step.

    Returns:
        ``current_step / num_warmup_steps`` while warming up; afterwards
        ``0.5 * (1 + cos(2 * pi * num_cycles * progress))`` clamped at 0, where
        ``progress`` is the fraction of post-warmup steps already done.
    """
    warmup_span = float(max(1, num_warmup_steps))
    if current_step < num_warmup_steps:
        return float(current_step) / warmup_span

    decay_span = float(max(1, num_training_steps - num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / decay_span
    cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
    return max(0.0, 0.5 * (1.0 + cosine))

def get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(current_step: int) -> float:
    """Learning-rate multiplier: linear warmup, then cosine decay with hard restarts.

    Reads the module-level ``num_warmup_steps``, ``num_training_steps`` and
    ``num_cycles``.

    Args:
        current_step: Index of the optimisation step.

    Returns:
        ``current_step / num_warmup_steps`` during warmup, ``0.0`` once the
        post-warmup progress reaches 1, and otherwise a half-cosine decay that
        restarts each time ``num_cycles * progress`` crosses an integer.
    """
    in_warmup = current_step < num_warmup_steps
    if in_warmup:
        return float(current_step) / float(max(1, num_warmup_steps))

    steps_after_warmup = float(current_step - num_warmup_steps)
    progress = steps_after_warmup / float(max(1, num_training_steps - num_warmup_steps))
    if not progress < 1.0:
        return 0.0

    # Position inside the current cycle, in [0, 1).
    cycle_position = (float(num_cycles) * progress) % 1.0
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * cycle_position)))


lrs = [get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(i) for i in range(num_training_steps)]

In [None]:
pl.DataFrame(lrs, schema=["lr"]).with_row_index(name="step").plot.line("step", "lr")

In [None]:
df

In [None]:
df.describe()

In [None]:
df.with_columns(cs.contains("Rate") - pl.col("midprice").shift(1))

In [None]:
exps = [
    pl.col(f"{side}Rate{i}") - pl.col(f"{side}Rate{i-1}")
    for i in range(1, 15)
    for side in ["ask", "bid"]
]
exps.append(cs.contains("Rate0") - pl.col("midprice"))

In [None]:
(
    df.with_columns(*exps)
    # .filter(pl.any_horizontal(cs.contains("bidRate") < -0.001))
)

In [None]:
path = "/home/pl487/time-series-prediction/data/predictions/deeplob-reg-15_2024-12-11T16-00-39_model_train_1_val.parquet"
df = pl.read_parquet(path)

In [None]:
df["pred"].describe()

In [None]:
import polars as pl
from sklearn.metrics import r2_score, mean_squared_error

path = "/home/pl487/time-series-prediction/outputs/model_train/deeplob-reg-15_2024-12-13T15-53-48/predictions/val_preds_test.tsv"
df = pl.read_csv(path, separator="\t")

In [None]:
df = (
    df
    .filter(pl.col("step") == pl.col("step").max())
)

In [None]:
r2_score(df["y"], df["preds"]), mean_squared_error(df["y"], df["preds"])

In [None]:
df.with_columns(
    r2=1 - ((pl.col("y") - pl.col("preds")).pow(2).sum() / (pl.col("y") - pl.col("y").mean()).pow(2).sum())
)["r2"].mean()

In [None]:
# Prediction error summary. The frame loaded from the TSV is read with a
# "preds" column by the cells above, so use the same column name here.
df.with_columns(diff=pl.col("preds") - pl.col("y"))["diff"].describe()

In [None]:
2 * 4 + 1

In [193]:
is_classification = False
window_size = 100
ds = LOBDataset("data/train_memmap.npy", is_classification=is_classification, use_prev_y=False, window_size=window_size)
dl = DataLoader(ds, batch_size=64, collate_fn=lambda x: collate_fn(x, max_len=window_size), shuffle=True)

In [None]:
ds[0]["X"].shape

In [None]:
batch = next(iter(dl))
x, y = batch["X"], batch["y"]
x.shape, y.shape

In [None]:
config = DeepLOBConfig(is_classification=is_classification)
model = config.get_model()
preds = model(x)

In [None]:
y.shape

In [None]:
preds

In [None]:
preds.shape

In [None]:
h = x.unsqueeze(1)
h.shape

In [None]:
h = model.conv1(h)
h = model.conv2(h)
h = model.conv3(h)
h.shape

In [None]:
model.inp1(h).shape, model.inp2(h).shape, model.inp3(h).shape

In [None]:
hc = torch.cat([model.inp1(h), model.inp2(h), model.inp3(h)], dim=1)
hc.shape

In [None]:
hc = hc.squeeze(-1)
hc.shape

In [None]:
hc = hc.permute(0, 2, 1)
hc.shape

In [166]:
hl, hh = model.lstm(hc)

In [None]:
hl.shape

In [None]:
hh

In [None]:
x = torch.tensor(
    [[1, 2, 3, 4], 
     [5, 6, 7, 8]], 
    dtype=torch.float32,
)
h = x.unsqueeze(0).unsqueeze(0)
x.shape, h.shape

In [None]:
conv1 = torch.nn.Conv2d(1, 2, kernel_size=(1, 2), stride=(1, 2), bias=True)
conv1.weight = torch.nn.Parameter(torch.ones_like(conv1.weight))
conv1.bias = torch.nn.Parameter(torch.zeros_like(conv1.bias) + 10.)
h1 = conv1(h)
h1.squeeze(0)

In [None]:
conv2 = torch.nn.Conv2d(2, 2, kernel_size=(4, 1), padding="same")
conv2.weight = torch.nn.Parameter(torch.ones_like(conv2.weight))
conv2.bias = torch.nn.Parameter(torch.zeros_like(conv2.bias) - 10.)
h2 = conv2(h1)
h2.squeeze(0)

In [13]:
h1 = model.conv1[0](x)
h2 = model.conv1[1](h1)

In [None]:
h1.shape, h2.shape

In [None]:
x = batch["X"]
 
# Add channel dimention
# x: batch_size, n_channels, seq_len, num_levels * 4
x = x.unsqueeze(1)

# Convolution blocks
# x: batch_size, n_channels, seq_len, num_levels * 2
x = model.conv1(x)
x = model.conv2(x)
x = model.conv3(x)

# Inception blocks
# batch_size, incep_out_channels, seq_len, 1
x_inp1 = model.inp1(x)
x_inp2 = model.inp2(x)
x_inp3 = model.inp3(x)

# Concatenate inception blocks
# batch_size, incep_out_channels * 3, seq_len, 1
x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)

x = x.squeeze(-1).permute(0, 2, 1)
x.shape

In [None]:
x, _ = model.pico(x)
x.shape

In [None]:
x = model.fc(x)
x.shape

In [None]:
x = x.squeeze(-1)
x.shape

In [None]:
x, y = batch["X"], batch["y"]
preds = model(x)

In [None]:
preds.shape

In [None]:
mask = ~(y <= -100)

In [None]:
mask.shape

In [None]:
error = (preds[mask] - y[mask])**2

In [None]:
error.mean()

In [None]:
list(zip(*{"a": [1, 2], "b": [3, 4]}.values()))

In [None]:
y = batch["y"]

torch.nn.functional.mse_loss(x, y)

In [None]:
from torchmetrics import MeanSquaredError

In [None]:
mse = MeanSquaredError()

In [None]:
mse(x, y)

In [None]:
head = torch.nn.Linear(192, 1, bias=False)

In [None]:
head(x).squeeze(-1)

In [None]:
config = DeepLOBConfig(is_classification=True)
model = config.get_model()

In [None]:
x, y = batch["X"], batch["y"]
preds = model.forward(x)
loss = torch.nn.functional.cross_entropy(preds, y)

In [None]:
loss

In [None]:
from torch._tensor import Tensor


def collate_fn(batch) -> tuple[torch.Tensor, torch.Tensor]:
    """Left-pad variable-length windows to a common length and stack them.

    Args:
        batch: Non-empty sequence of mappings with keys ``"X"`` and ``"y"``.
            Values may be tensors or anything ``torch.as_tensor`` accepts
            (e.g. NumPy arrays). ``"X"`` is padded along its second-to-last
            dimension, so it needs at least two dimensions
            (assumed ``(seq_len, n_features)`` — TODO confirm).

    Returns:
        ``(X_batch, y_batch)``: the padded ``"X"`` items stacked along a new
        leading dimension, and the ``"y"`` items stacked likewise.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    features = [torch.as_tensor(item["X"]) for item in batch]

    # Find the maximum length in the batch
    max_len = max(x.shape[0] for x in features)

    # Pad each instance to the maximum length. Zeros are prepended (pad the
    # past) such that the most recent observation stays at the end.
    padded_batch = []
    for x in features:
        pad_size = max_len - x.shape[0]
        if pad_size > 0:
            x = torch.nn.functional.pad(x, (0, 0, pad_size, 0), "constant", 0)
        padded_batch.append(x)

    # Stack the padded instances
    X_batch = torch.stack(padded_batch)
    y_batch = torch.stack([torch.as_tensor(item["y"]) for item in batch])

    return X_batch, y_batch

dl = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)
batch = next(iter(dl))

In [None]:
from src.models.deeplob import DeepLOB

model = DeepLOB()

In [None]:
X, y = batch

In [None]:
torch.nn.functional.mse_loss(model(X), y)

In [None]:
from src.module import RunningStage

In [None]:
f"{RunningStage.TRAIN}"

In [None]:
batch = [dataset[i] for i in range(3)]

In [None]:
max_len = max(item["X"].shape[0] for item in batch)

In [None]:
item = batch[1]["X"]

In [None]:
item.shape

In [None]:
pad_size = max_len - item.shape[0]

torch.nn.functional.pad(item, (0, 0, pad_size, 0), mode="constant", value=0.)[-2] == item[-2]

In [None]:
class LOBDataset(Dataset):
    """Sliding-window dataset over a tabular limit-order-book frame.

    ``df`` must support ``len()``, row slicing, ``df["y"]``, ``.drop("y")``,
    ``.to_numpy()`` and ``.width`` (presumably a polars DataFrame — verify
    against the caller).

    Args:
        df: Frame holding a target column ``"y"`` plus feature columns.
        window_size: Number of rows of history placed before the indexed row.
        use_prev_y: When true, each item also carries the targets of the
            history rows under ``"prev_y"``.
    """

    def __init__(self, df, window_size: int = 100, use_prev_y: bool = False) -> None:
        super().__init__()
        self.df = df
        self.window_size = window_size
        self.use_prev_y = use_prev_y

    def __getitem__(self, index: int) -> dict:
        """Return the window that ends at row ``index`` (inclusive).

        Negative indices count from the end, as with Python sequences.

        Returns:
            Dict with ``"X"`` (features of up to ``window_size + 1`` rows),
            ``"y"`` (target of the last row) and, if ``use_prev_y`` is set,
            ``"prev_y"`` (targets of all rows but the last).

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self.df)
        # Normalise negative indices first; slicing with a negative stop would
        # otherwise silently select the wrong rows.
        if index < 0:
            index += n_rows
        if not 0 <= index < n_rows:
            raise IndexError("Index out of bounds")

        # Early rows have less history available, so their window is shorter.
        data = self.df[max(0, index - self.window_size) : index + 1]
        y = data["y"].to_numpy()
        X = data.drop("y").to_numpy()
        out = {"X": X, "y": y[-1]}
        if self.use_prev_y:
            out["prev_y"] = y[:-1]
        return out

    def __len__(self) -> int:
        """Number of rows in the underlying frame (one item per row)."""
        return len(self.df)

    @property
    def num_levels(self) -> int:
        """Feature-column count (all columns except ``"y"``) floor-divided by 4.

        NOTE(review): assumes four columns per book level (ask/bid rate and
        size) — confirm against the frame's schema.
        """
        return (self.df.width - 1) // 4


In [None]:
ds_dict["train"]["y"].value_counts().sort("y").plot.bar(x="y:N", y="count")

In [None]:
def continuous_to_class(y: float, low: float = -5.0, step: float = 0.5) -> int:
    """Map a continuous target to an integer bin index.

    The index is ``(y - low) / step`` truncated towards zero by ``int``. With
    the defaults this is the original ``int((y + 5) / 0.5)``.

    Args:
        y: Continuous target value.
        low: Value that maps to class 0.
        step: Width of one bin; must be positive.

    Returns:
        The bin index of ``y``.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be positive")
    return int((y - low) / step)


In [None]:
c = np.arange(-5, 5.25, 0.25)

In [None]:
((c + 5) / 0.25).astype(np.int32)

In [None]:
c = torch.arange(-5, 5.25, 0.25)
((c + 5) / 0.25).int() * 0.25 - 5

In [None]:
for k, v in ds_dict.items():
    v.write_parquet(f"data/{k}.parquet")

In [None]:
class LOBDataset(Dataset):
    """Sliding-window dataset that always returns the history targets.

    ``df`` must support ``len()``, row slicing, ``df["y"]``, ``.drop("y")``,
    ``.to_numpy()`` and ``.width`` (presumably a polars DataFrame — verify
    against the caller).

    Args:
        df: Frame holding a target column ``"y"`` plus feature columns.
        window_size: Number of rows of history placed before the indexed row.
    """

    def __init__(self, df, window_size: int = 100) -> None:
        super().__init__()
        self.df = df
        self.window_size = window_size

    def __getitem__(self, index: int) -> dict:
        """Return the window that ends at row ``index`` (inclusive).

        Negative indices count from the end, as with Python sequences.

        Returns:
            Dict with ``"X"`` (features of up to ``window_size + 1`` rows),
            ``"y"`` (target of the last row) and ``"prev_y"`` (targets of all
            rows but the last).

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self.df)
        # Normalise negative indices first; slicing with a negative stop would
        # otherwise silently select the wrong rows.
        if index < 0:
            index += n_rows
        if not 0 <= index < n_rows:
            raise IndexError("Index out of bounds")

        # Early rows have less history available, so their window is shorter.
        data = self.df[max(0, index - self.window_size) : index + 1]
        y = data["y"].to_numpy()
        X = data.drop("y").to_numpy()
        return {"X": X, "y": y[-1], "prev_y": y[:-1]}

    def __len__(self) -> int:
        """Number of rows in the underlying frame (one item per row)."""
        return len(self.df)

    @property
    def num_levels(self) -> int:
        """Feature-column count (all columns except ``"y"``) floor-divided by 4.

        ``width`` is already an integer column count, so it is used directly
        (calling ``len`` on it raises ``TypeError``).

        NOTE(review): assumes four columns per book level (ask/bid rate and
        size) — confirm against the frame's schema.
        """
        return (self.df.width - 1) // 4

In [None]:
train_ds = LOBDataset(ds_dict["val"])

In [None]:
train_ds[0]["X"].shape

In [None]:
train_ds[8]["X"].shape

In [None]:
dl = DataLoader(train_ds, batch_size=2)

In [None]:
batch = next(iter(dl))

In [None]:
batch["prev_y"]

In [None]:
ds = LOBDataset(df)
dl = DataLoader(ds, batch_size=32, shuffle=True)

In [None]:
torch.set_float32_matmul_precision("high")

fabric = Fabric(accelerator="gpu")

model = DeepLOB(ds.num_levels)
# model.compile()

model = fabric.setup_module(model)
dl = fabric.setup_dataloaders(dl)

In [None]:
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.MSELoss()
# NOTE(review): the optimizer is not passed through Fabric's optimizer setup —
# confirm this is intended for the accelerator in use.

for epoch in trange(num_epochs, desc="Epoch"):
    model.train()
    total_loss = 0.0
    num_batches = 0  # batches that actually contributed to total_loss
    for batch in tqdm(dl, desc="Batch", leave=False):
        # NOTE(review): assumes each batch is an (X, y) pair — confirm against
        # the dataset/collate function feeding `dl`.
        X, y = batch
        # Skip batches whose windows are shorter than the configured size.
        if X.shape[1] < ds.window_size:
            continue
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        fabric.backward(loss)
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Average over the batches that were trained on; dividing by len(dl)
    # would count the skipped batches and understate the loss.
    avg_loss = total_loss / max(1, num_batches)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

In [None]:
batch = df[:100]

In [None]:
model = DeepLOB(15)

In [None]:
X = (
    torch.tensor(batch.drop("y").to_numpy(), dtype=torch.float32)
    .unsqueeze(0)
    .repeat(2, 1, 1)
    .unsqueeze(1)
)
X.shape

In [None]:
x = model.conv1(X)  # batch_size, n_channels, seq_len, num_levels * 2
x = model.conv2(x)
x = model.conv3(x)

x_inp1 = model.inp1(x)
x_inp2 = model.inp2(x)
x_inp3 = model.inp3(x)

xc = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
# print(xc.shape)

In [None]:
x.shape, xc.shape

In [None]:
x1 = x.permute(0, 2, 1, 3)
x1 = x1.reshape((-1, x1.shape[1], x1.shape[2]))
x2 = x.squeeze(-1).permute(0, 2, 1)
x1.shape, x2.shape

In [None]:
x3, _ = model.lstm(x2)

In [None]:
x3.shape, x3[:, -1, :].shape

In [None]:
model.conv1(X).shape

In [None]:
window = 100
df = df.with_columns(
    askRate0_norm=(
        (pl.col("askRate0") - pl.col("askRate0").rolling_mean(window)) 
        / pl.col("askRate0").rolling_std(window)
    )
)

In [None]:
df.select(cs.contains("askRate0"))

In [None]:
int(0.7 * len(df)), int(0.2 * len(df))

In [None]:
train_df = df[: 2_500_000]
test_df = df[-700_000:]

In [None]:
train_df

In [None]:
class LOBDataset(Dataset):
    """Sliding-window dataset that yields ``(X, y)`` float32 tensor pairs.

    Args:
        df: Source frame. Only columns whose names contain ``"Rate"``,
            ``"Size"`` or ``"y"`` are kept (note that the ``"y"`` selector
            matches any column name containing that letter).
        window_size: Number of rows of history placed before the indexed row.
    """

    def __init__(self, df: pl.DataFrame, window_size: int = 100) -> None:
        super().__init__()
        self.df = df.select(cs.contains("Rate") | cs.contains("Size") | cs.contains("y"))
        self.window_size = window_size

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        """Return features and targets for the window ending at row ``index``.

        Negative indices count from the end, as with Python sequences.

        Returns:
            ``(X, y)`` where ``X`` holds every column except ``"y"`` and ``y``
            holds the targets of all rows in the window (not only the last).

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self.df)
        # Normalise negative indices first; slicing with a negative stop would
        # otherwise silently select the wrong rows.
        if index < 0:
            index += n_rows
        if not 0 <= index < n_rows:
            raise IndexError("Index out of bounds")

        # Early rows have less history available, so their window is shorter.
        data = self.df[max(0, index - self.window_size) : index + 1]
        y = data["y"].to_numpy()
        X = data.drop("y").to_numpy()
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def __len__(self) -> int:
        """Number of rows in the selected frame (one item per row)."""
        return len(self.df)

    @property
    def num_levels(self) -> int:
        """Selected-column count minus one, floor-divided by 4.

        NOTE(review): assumes four columns per book level (ask/bid rate and
        size) and exactly one target column — confirm against the schema.
        """
        return (self.df.width - 1) // 4
        


In [None]:
ds = LOBDataset(df)

In [None]:
model = DeepLOB(ds.num_levels)

In [None]:
dl = DataLoader(ds, batch_size=32, shuffle=True)

In [None]:
batch = next(iter(dl))

In [None]:
X, y = batch

In [None]:
X

In [None]:
model(X.transpose(1, 2))