In [68]:
import polars as pl
import numpy as np
from tqdm import tqdm
from prj.oamp.oamp import OAMP
from prj.oamp.oamp_config import ConfigOAMP
import os, sys, gc
import pickle
import numpy as np
import pandas as pd
import polars as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)

from sklearn.metrics import r2_score
import kaggle_evaluation.jane_street_inference_server
import torch.optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import math
from tqdm import tqdm
from collections import OrderedDict
import warnings
import joblib
from pytorch_lightning.callbacks import Callback
import gc

import lightgbm as lgb
from lightgbm import LGBMRegressor, Booster
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [69]:
from prj.config import DATA_DIR
from prj.data.data_loader import PARTITIONS_DATE_INFO, DataConfig, DataLoader

data_args = data_args = {'include_time_id': True, 'include_intrastock_norm_temporal': True}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)

start, end = 1360, 1360 + 30*2
test_ds = loader.load(start, end).sort('date_id', 'time_id', 'symbol_id')
features = loader.features
X, y, w, info = loader._build_splits(test_ds)

X.shape

100%|██████████| 68/68 [00:03<00:00, 17.39it/s]


Skipping 1414-1420
Skipping 1415-1420
Skipping 1416-1420
Skipping 1417-1420
Skipping 1418-1420
Skipping 1419-1420
Skipping 1420-1420


(2187680, 134)

In [70]:
start_test_idx = test_ds.with_row_index().filter(pl.col('date_id').lt(1360+30)).collect().shape[0]
start_test_idx

1091904

# LGBM Agents

In [71]:
import lightgbm as lgb
from lleaves import Model
from pathlib import Path

lgbm_model_files = [
    "/home/lorecampa/projects/jane_street_forecasting/dataset/models/lgbm/lgbm_maxbin_63_0_7_324272949.txt",
    "/home/lorecampa/projects/jane_street_forecasting/dataset/models/lgbm/lgbm_maxbin_63_0_7_3234493111.txt"
]
lgbm_agents = [
    Model(model_file=file) for file in lgbm_model_files
]
for i in tqdm(range(len(lgbm_agents)), desc='Compiling lgbm models'):
    lgbm_agents[i].compile(cache=Path(lgbm_model_files[i]).with_suffix('.o'))


lgbm_agents_label = ['lgbm_1', 'lgbm_2']

Compiling lgbm models: 100%|██████████| 2/2 [00:00<00:00, 14.99it/s]


In [72]:
lgbm_predictions = np.concatenate([agent.predict(X).reshape(-1, 1) for agent in tqdm(lgbm_agents)], axis=1)
lgbm_predictions.shape

100%|██████████| 2/2 [00:07<00:00,  3.86s/it]


(2187680, 2)

# Graph Conv

In [73]:
import numpy as np
import polars as pl
from pathlib import Path
import gc
import os
from typing import List, Union, Dict, Any

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.nn import GCNConv

def get_device():
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

class JaneStreetMultiStockGraphDataset(Dataset):
    
    def __init__(self, dataset: pl.LazyFrame, adjacency_matrices: np.ndarray, num_stocks: int = 39):
        self.dataset = dataset
        self.adjacency_matrices = adjacency_matrices
        self.num_stocks = num_stocks
        self.dataset_len = self.dataset.select(['date_id', 'time_id']).unique().collect().shape[0]
        self._load()
    
    def _load(self):
        all_combinations = (
            self.dataset.select(['date_id', 'time_id'])
            .unique()
            .join(pl.DataFrame({'symbol_id': list(range(self.num_stocks))}, 
                               schema={'symbol_id': pl.Int8}).lazy(), how="cross")
        )
        feature_cols = [f'feature_{i:02d}' for i in range(79)]
        self.batch = (
            all_combinations
            .join(self.dataset.with_columns(pl.lit(1).alias('mask')), 
                  on=['date_id', 'time_id', 'symbol_id'], how="left")
            .fill_null(0)  # fill all columns with 0 for missing stocks (including the mask)
            .sort(['date_id', 'time_id', 'symbol_id'])
        )
        # num_stocks rows for each date and time
        self.X = self.batch.select(feature_cols).collect().to_numpy().astype(np.float32)
        self.y = self.batch.select(['responder_6']).collect().to_numpy().flatten().astype(np.float32)
        self.s = self.batch.select(['symbol_id']).collect().to_numpy().flatten().astype(np.int32)
        self.date_ids = self.batch.select(['date_id']).collect().to_numpy().flatten()
        self.masks = self.batch.select(['mask']).collect().to_numpy().flatten() == 0
        self.weights = self.batch.select(['weight']).collect().to_numpy().flatten().astype(np.float32)
    
    def __len__(self):
        return self.dataset_len
    
    def __getitem__(self, idx):
        start_row = idx * self.num_stocks
        features = self.X[start_row:start_row+self.num_stocks, :]
        targets = self.y[start_row:start_row+self.num_stocks]
        masks = self.masks[start_row:start_row+self.num_stocks]
        weights = self.weights[start_row:start_row+self.num_stocks]
        symbols = self.s[start_row:start_row+self.num_stocks]

        date_id = self.date_ids[start_row]
        adj_matrix = self.adjacency_matrices[date_id]
        
        return (
            torch.tensor(features), 
            torch.tensor(targets), 
            torch.tensor(masks), 
            torch.tensor(weights), 
            torch.tensor(symbols),
            torch.tensor(adj_matrix, dtype=torch.int)
        )
        
class WeightedMSELoss(nn.Module):
    def __init__(self):
        super(WeightedMSELoss, self).__init__()
    
    def forward(self, predictions: Tensor, targets: Tensor, weights: Tensor) -> Tensor:
        squared_diff = (predictions - targets) ** 2
        weighted_squared_diff = weights * squared_diff
        return weighted_squared_diff.sum() / weights.sum()
    
class TransposeLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, input: Tensor) -> Tensor:
        return input.transpose(1, 2)
    
class GraphConvEncoderLayer(nn.Module):
    def __init__(self, hidden_dim, dim_feedforward_mult=4, dropout_rate=0.1):
        super(GraphConvEncoderLayer, self).__init__()
        
        self.graph_conv = GCNConv(
            in_channels=hidden_dim, 
            out_channels=hidden_dim
        )

        self.feedforward = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * dim_feedforward_mult),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim * dim_feedforward_mult, hidden_dim)
        )

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, edge_index):
        batch_size, num_nodes, num_features = x.size()

        residual = x
        x = x.reshape(batch_size * num_nodes, num_features)
        x = self.graph_conv(x, edge_index)
        x = x.reshape(batch_size, num_nodes, num_features)        
        x = self.dropout1(x) + residual
        x = self.norm1(x)

        residual = x
        x = self.feedforward(x)
        x = self.dropout2(x) + residual
        x = self.norm2(x)

        return x
    

class GraphConvEncoder(nn.Module):
    def __init__(self, hidden_dim, num_layers, dim_feedforward_mult=4, dropout_rate=0.1):
        super(GraphConvEncoder, self).__init__()
        self.layers = nn.ModuleList([
            GraphConvEncoderLayer(
                hidden_dim=hidden_dim,
                dim_feedforward_mult=dim_feedforward_mult,
                dropout_rate=dropout_rate
            ) for _ in range(num_layers)
        ])

    def forward(self, x, adj):
        batch_size, num_nodes, _ = x.size()

        edge_indices = []
        for batch_idx in range(batch_size):
            adj_matrix = adj[batch_idx]
            src, tgt = torch.nonzero(adj_matrix, as_tuple=True)
            src = src + batch_idx * num_nodes
            tgt = tgt + batch_idx * num_nodes
            edge_indices.append(torch.stack([src, tgt], dim=0))

        edge_index = torch.cat(edge_indices, dim=1).to(x.device)
        
        for layer in self.layers:
            x = layer(x, edge_index)
        return x
    

class StockGCNModel(nn.Module):
    def __init__(
        self,
        input_features,
        hidden_dim=64,
        output_dim=1,
        num_layers=2,
        num_stocks=39,
        embedding_dim=16,
        use_embeddings=False,
        dropout_rate=0.2,
        dim_feedforward_mult=4,
    ):
        super(StockGCNModel, self).__init__()

        self.use_embeddings = use_embeddings

        self.init_layers = nn.Sequential(
            # TransposeLayer(),
            # nn.BatchNorm1d(input_features),
            # TransposeLayer(),
            nn.Dropout(dropout_rate),
        )
        self.feature_projector = []
        if use_embeddings:
            self.feature_projector.append(nn.Linear(input_features + embedding_dim, hidden_dim))
            self.embedding_layer = nn.Embedding(num_stocks, embedding_dim)
        else:
            self.feature_projector.append(nn.Linear(input_features, hidden_dim))
        self.feature_projector += [
            # TransposeLayer(),
            # nn.BatchNorm1d(hidden_dim),
            # TransposeLayer(),
            nn.Dropout(dropout_rate),
        ]
        self.feature_projector = nn.Sequential(*self.feature_projector)

        self.encoder = GraphConvEncoder(
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dim_feedforward_mult=dim_feedforward_mult,
            dropout_rate=dropout_rate
        )

        self.predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            # TransposeLayer(),
            # nn.BatchNorm1d(hidden_dim),
            # TransposeLayer(),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x, symbols, adj):
        batch_size, num_stocks, num_features = x.size()

        x = self.init_layers(x)
        if self.use_embeddings:
            stock_embeddings = self.embedding_layer(symbols)
            x = torch.cat([x, stock_embeddings], dim=-1)
        x = self.feature_projector(x)
        x = self.encoder(x, adj)

        output = self.predictor(x)
        return 5 * torch.tanh(output)
    
    def predict(self, dl, device):
        with torch.no_grad():
            y_out = []
            for x, targets, m, w, s, A in dl:
                    y_out.append(self(x.to(device), s.to(device), A.to(device)).squeeze().cpu().numpy())
            
            return np.concatenate(y_out)
                
    
def evaluate_model(model, val_dl, device):
    ss_res = 0.0
    ss_tot = 0.0
    for x, targets, m, w, s, A in val_dl:
        with torch.no_grad():
            y_out = model(x.to(device), s.to(device), A.to(device)).squeeze()
        w = w.to(device)
        targets = targets.to(device)
        ss_res += (w * (y_out - targets) ** 2).sum().cpu()
        ss_tot += (w * (targets ** 2)).sum().cpu()
    return 1 - ss_res / ss_tot

In [74]:
adjacency_matrices = np.load('/home/lorecampa/projects/jane_street_forecasting/dataset/sources/graph_conv_torch/adjacency_matrices.npy')

In [75]:
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [76]:

save_path = '/home/lorecampa/projects/jane_street_forecasting/dataset/models/graph_conv/model_3_7.pth'
model = StockGCNModel(
    input_features=79,
    output_dim=1,
    num_layers=1,
    dropout_rate=0.2,
    dim_feedforward_mult=4,
    hidden_dim=64
)
model.load_state_dict(torch.load(save_path, weights_only=True, map_location=torch.device(device)))
model = model.to(device)

graph_conv_models = [model]
graph_conv_agents_label = ['graph_conv_1']

In [77]:
test_dataset = JaneStreetMultiStockGraphDataset(test_ds, adjacency_matrices)
test_dataloader = DataLoader(test_dataset, batch_size=2048, shuffle=False, num_workers=0)


graph_conv_predictions = model.predict(test_dataloader, device).flatten().reshape(-1, 1)
graph_conv_predictions.shape

(2302872, 1)

In [79]:
lgbm_predictions.shape

(2187680, 2)

In [None]:
lg

In [80]:
predictions = np.concatenate([lgbm_predictions], axis=1)
agent_labels = ['lgbm_1', 'lgbm_2']
predictions.shape

(2187680, 2)

# Ens

In [81]:
from sklearn.metrics import r2_score
from prj.metrics import weighted_mae, weighted_mse, weighted_rmse

def metrics(y_true, y_pred, weights):
    
    return {
        'r2_w': r2_score(y_true, y_pred, sample_weight=weights),
        'mae_w': weighted_mae(y_true, y_pred, weights=weights),
        'mse_w': weighted_mse(y_true, y_pred, weights=weights),
        'rmse_w': weighted_rmse(y_true, y_pred, weights=weights),
    }

In [82]:
model = CatBoostRegressor(iterations=100, learning_rate=0.01, depth=5, task_type='GPU')

X_train, y_train, w_train = predictions[:start_test_idx, :], y[:start_test_idx], w[:start_test_idx]
X_test, y_test, w_test = predictions[start_test_idx:, :], y[start_test_idx:], w[start_test_idx:]


X_train = np.concatenate([X[:start_test_idx, :], lgbm_predictions[:start_test_idx, :]], axis=1)

model.fit(X_train, y_train, sample_weight=None, verbose=50)

y_pred = model.predict(X_test)
res = metrics(y_test, y_pred, w_test)


0:	learn: 1.0245501	total: 15.7ms	remaining: 1.56s
50:	learn: 1.0206023	total: 753ms	remaining: 723ms
99:	learn: 1.0189951	total: 1.47s	remaining: 0us


In [83]:
from sklearn.metrics import r2_score
from prj.metrics import weighted_mae, weighted_mse, weighted_rmse

def metrics(y_true, y_pred, weights):
    return {
        'r2_w': r2_score(y_true, y_pred, sample_weight=weights),
        'mae_w': weighted_mae(y_true, y_pred, weights=weights),
        'mse_w': weighted_mse(y_true, y_pred, weights=weights),
        'rmse_w': weighted_rmse(y_true, y_pred, weights=weights),
    }
    
results = {}
columns = list(res.keys())
results['ensemble'] = res.values()
for i in range(len(agent_labels)):
    results[agent_labels[i]] = metrics(y[start_test_idx:], predictions[start_test_idx:, i], w[start_test_idx:]).values()
results['mean'] = metrics(y[start_test_idx:], np.mean(predictions[start_test_idx:], axis=1), w[start_test_idx:]).values()
results['median'] = metrics(y[start_test_idx:], np.median(predictions[start_test_idx:], axis=1), w[start_test_idx:]).values()


results = pl.DataFrame(results)\
    .transpose(include_header=True, column_names=columns, header_name='Agent')\
    .sort('r2_w', descending=True) \
    .to_pandas().set_index('Agent')
results

Unnamed: 0_level_0,r2_w,mae_w,mse_w,rmse_w
Agent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mean,0.014817,0.541524,0.71056,0.842947
median,0.014817,0.541524,0.71056,0.842947
lgbm_2,0.01449,0.541683,0.710796,0.843087
lgbm_1,0.014368,0.541667,0.710885,0.84314
ensemble,0.010996,0.541728,0.713317,0.844581
