In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
import numpy as np
import polars as pl
from pathlib import Path
import gc
import os
from typing import List, Union, Dict, Any
import random

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.nn import GCNConv

In [3]:
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [4]:
BASE_PATH = Path('/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet')
feature_cols = [f'feature_{i:02d}' for i in range(79)]

train_ds = pl.concat([
    pl.scan_parquet(BASE_PATH / f'partition_id={i}' / 'part-0.parquet')
    for i in range(5, 9)
]).sort(['date_id', 'time_id', 'symbol_id']).fill_nan(0).fill_null(0)

max_date = train_ds.select(pl.col('date_id').max()).collect().item()
val_ds = train_ds.filter(pl.col('date_id') > max_date - 30)
train_ds = train_ds.filter(pl.col('date_id') <= max_date - 30)

adjacency_matrices = np.load('/kaggle/input/jane-street-2024-graph-computation/adjacency_matrices.npy')

In [5]:
def get_device():
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

In [6]:
class JaneStreetMultiStockGraphDatasetLazy(Dataset):
    
    def __init__(self, dataset: pl.LazyFrame, adjacency_matrices: np.ndarray, num_stocks: int = 39):
        self.dataset = dataset
        self.adjacency_matrices = adjacency_matrices
        self.num_stocks = num_stocks
        self.dataset_len = self.dataset.select(['date_id', 'time_id']).unique().collect().shape[0]
        self._load()
    
    def _load(self):
        all_combinations = (
            self.dataset.select(['date_id', 'time_id'])
            .unique()
            .join(pl.DataFrame({'symbol_id': list(range(self.num_stocks))}, 
                               schema={'symbol_id': pl.Int8}).lazy(), how="cross")
        )
        feature_cols = [f'feature_{i:02d}' for i in range(79)]
        self.batch = (
            all_combinations
            .join(self.dataset.with_columns(pl.lit(1).alias('mask')), 
                  on=['date_id', 'time_id', 'symbol_id'], how="left")
            .fill_null(0)  # fill all columns with 0 for missing stocks (including the mask)
            .sort(['date_id', 'time_id', 'symbol_id'])
        )
        # num_stocks rows for each date and time
        self.X = self.batch.select(feature_cols).collect().to_numpy().astype(np.float32)
        self.y = self.batch.select(['responder_6']).collect().to_numpy().flatten().astype(np.float32)
        self.s = self.batch.select(['symbol_id']).collect().to_numpy().flatten().astype(np.int32)
        self.date_ids = self.batch.select(['date_id']).collect().to_numpy().flatten()
        self.masks = self.batch.select(['mask']).collect().to_numpy().flatten() == 0
        self.weights = self.batch.select(['weight']).collect().to_numpy().flatten().astype(np.float32)
    
    def __len__(self):
        return self.dataset_len
    
    def __getitem__(self, idx):
        start_row = idx * self.num_stocks
        features = self.X[start_row:start_row+self.num_stocks, :]
        targets = self.y[start_row:start_row+self.num_stocks]
        masks = self.masks[start_row:start_row+self.num_stocks]
        weights = self.weights[start_row:start_row+self.num_stocks]
        symbols = self.s[start_row:start_row+self.num_stocks]

        date_id = self.date_ids[start_row]
        adj_matrix = self.adjacency_matrices[date_id]
        
        return (
            torch.tensor(features), 
            torch.tensor(targets), 
            torch.tensor(masks), 
            torch.tensor(weights), 
            torch.tensor(symbols),
            torch.tensor(adj_matrix, dtype=torch.int)
        )

In [7]:
class JaneStreetMultiStockGraphDataset(Dataset):
    
    def __init__(self, dataset: pl.DataFrame, adjacency_matrices: np.ndarray, num_stocks: int = 39):
        self.dataset = dataset
        self.adjacency_matrices = adjacency_matrices
        self.num_stocks = num_stocks
        self.dataset_len = self.dataset.select(['date_id', 'time_id']).unique().shape[0]
        self._load()
    
    def _load(self):
        all_combinations = (
            self.dataset.select(['date_id', 'time_id'])
            .unique()
            .join(pl.DataFrame({'symbol_id': list(range(self.num_stocks))}, 
                               schema={'symbol_id': pl.Int8}), how="cross")
        )
        feature_cols = [f'feature_{i:02d}' for i in range(79)]
        self.batch = (
            all_combinations
            .join(self.dataset.with_columns(pl.lit(1).alias('mask')), 
                  on=['date_id', 'time_id', 'symbol_id'], how="left")
            .fill_null(0)  # fill all columns with 0 for missing stocks (including the mask)
            .sort(['date_id', 'time_id', 'symbol_id'])
        )
        # num_stocks rows for each date and time
        self.X = self.batch.select(feature_cols).to_numpy().astype(np.float32)
        self.y = self.batch.select(['responder_6']).to_numpy().flatten().astype(np.float32)
        self.s = self.batch.select(['symbol_id']).to_numpy().flatten().astype(np.int32)
        self.date_ids = self.batch.select(['date_id']).to_numpy().flatten()
        self.masks = self.batch.select(['mask']).to_numpy().flatten() == 0
        self.weights = self.batch.select(['weight']).to_numpy().flatten().astype(np.float32)
    
    def __len__(self):
        return self.dataset_len
    
    def __getitem__(self, idx):
        start_row = idx * self.num_stocks
        features = self.X[start_row:start_row+self.num_stocks, :]
        targets = self.y[start_row:start_row+self.num_stocks]
        masks = self.masks[start_row:start_row+self.num_stocks]
        weights = self.weights[start_row:start_row+self.num_stocks]
        symbols = self.s[start_row:start_row+self.num_stocks]

        date_id = self.date_ids[start_row]
        adj_matrix = self.adjacency_matrices[date_id]
        
        return (
            torch.tensor(features), 
            torch.tensor(targets), 
            torch.tensor(masks), 
            torch.tensor(weights), 
            torch.tensor(symbols),
            torch.tensor(adj_matrix, dtype=torch.int)
        )

In [8]:
class WeightedMSELoss(nn.Module):
    def __init__(self):
        super(WeightedMSELoss, self).__init__()
    
    def forward(self, predictions: Tensor, targets: Tensor, weights: Tensor) -> Tensor:
        squared_diff = (predictions - targets) ** 2
        weighted_squared_diff = weights * squared_diff
        return weighted_squared_diff.sum() / weights.sum()

In [9]:
class TransposeLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, input: Tensor) -> Tensor:
        return input.transpose(1, 2)

In [10]:
class GraphConvEncoderLayer(nn.Module):
    def __init__(self, hidden_dim, dim_feedforward_mult=4, dropout_rate=0.1):
        super(GraphConvEncoderLayer, self).__init__()
        
        self.graph_conv = GCNConv(
            in_channels=hidden_dim, 
            out_channels=hidden_dim
        )

        self.feedforward = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * dim_feedforward_mult),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim * dim_feedforward_mult, hidden_dim)
        )

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, edge_index):
        batch_size, num_nodes, num_features = x.size()

        residual = x
        x = x.reshape(batch_size * num_nodes, num_features)
        x = self.graph_conv(x, edge_index)
        x = x.reshape(batch_size, num_nodes, num_features)        
        x = self.dropout1(x) + residual
        x = self.norm1(x)

        residual = x
        x = self.feedforward(x)
        x = self.dropout2(x) + residual
        x = self.norm2(x)

        return x

In [11]:
class GraphConvEncoder(nn.Module):
    def __init__(self, hidden_dim, num_layers, dim_feedforward_mult=4, dropout_rate=0.1):
        super(GraphConvEncoder, self).__init__()
        self.layers = nn.ModuleList([
            GraphConvEncoderLayer(
                hidden_dim=hidden_dim,
                dim_feedforward_mult=dim_feedforward_mult,
                dropout_rate=dropout_rate
            ) for _ in range(num_layers)
        ])

    def forward(self, x, adj):
        batch_size, num_nodes, _ = x.size()

        edge_indices = []
        for batch_idx in range(batch_size):
            adj_matrix = adj[batch_idx]
            src, tgt = torch.nonzero(adj_matrix, as_tuple=True)
            src = src + batch_idx * num_nodes
            tgt = tgt + batch_idx * num_nodes
            edge_indices.append(torch.stack([src, tgt], dim=0))

        edge_index = torch.cat(edge_indices, dim=1).to(x.device)
        
        for layer in self.layers:
            x = layer(x, edge_index)
        return x

In [12]:
class StockGCNModel(nn.Module):
    def __init__(
        self,
        input_features,
        hidden_dim=64,
        output_dim=1,
        num_layers=2,
        num_stocks=39,
        embedding_dim=16,
        use_embeddings=False,
        dropout_rate=0.2,
        dim_feedforward_mult=4,
    ):
        super(StockGCNModel, self).__init__()

        self.use_embeddings = use_embeddings

        self.init_layers = nn.Sequential(
            # TransposeLayer(),
            # nn.BatchNorm1d(input_features),
            # TransposeLayer(),
            nn.Dropout(dropout_rate),
        )
        self.feature_projector = []
        if use_embeddings:
            self.feature_projector.append(nn.Linear(input_features + embedding_dim, hidden_dim))
            self.embedding_layer = nn.Embedding(num_stocks, embedding_dim)
        else:
            self.feature_projector.append(nn.Linear(input_features, hidden_dim))
        self.feature_projector += [
            # TransposeLayer(),
            # nn.BatchNorm1d(hidden_dim),
            # TransposeLayer(),
            nn.Dropout(dropout_rate),
        ]
        self.feature_projector = nn.Sequential(*self.feature_projector)

        self.encoder = GraphConvEncoder(
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dim_feedforward_mult=dim_feedforward_mult,
            dropout_rate=dropout_rate
        )

        self.predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            # TransposeLayer(),
            # nn.BatchNorm1d(hidden_dim),
            # TransposeLayer(),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x, symbols, adj):
        batch_size, num_stocks, num_features = x.size()

        x = self.init_layers(x)
        if self.use_embeddings:
            stock_embeddings = self.embedding_layer(symbols)
            x = torch.cat([x, stock_embeddings], dim=-1)
        x = self.feature_projector(x)
        x = self.encoder(x, adj)

        output = self.predictor(x)
        return 5 * torch.tanh(output)

In [13]:
def evaluate_model(model, val_dl, device):
    ss_res = 0.0
    ss_tot = 0.0
    for x, targets, m, w, s, A in val_dl:
        with torch.no_grad():
            y_out = model(x.to(device), s.to(device), A.to(device)).squeeze()
        w = w.to(device)
        targets = targets.to(device)
        ss_res += (w * (y_out - targets) ** 2).sum().cpu()
        ss_tot += (w * (targets ** 2)).sum().cpu()
    return 1 - ss_res / ss_tot

In [14]:
def train_with_es(model, optimizer, train_dl, val_dl, epochs_max, gradient_clipping, loss_fn, is_weighted_loss, output_dir, es_patience, device, scheduler):
    
    save_path = os.path.join(output_dir, 'best_model.pth')
    torch.save(model.state_dict(), save_path)
    best_score = evaluate_model(model, val_dl, device).item()
    print(f'Initial weighted r2: {best_score}')
    best_epoch = -1
    for epoch in range(epochs_max):
        model.train()
        for x, targets, m, w, s, A in train_dl:
            optimizer.zero_grad()
            y_out = model.forward(x.to(device), s.to(device), A.to(device)).squeeze()
            if is_weighted_loss:
                loss = loss_fn(y_out, targets.to(device), w.to(device))
            else:
                loss = loss_fn(y_out, targets.to(device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping)
            optimizer.step()
            
        model.eval()
        score = evaluate_model(model, val_dl, device).item()
        print(f'Epoch {epoch} weighted r2: {score}')
        if score > best_score:
            torch.save(model.state_dict(), save_path)
            best_epoch = epoch
            best_score = score
        elif epoch - best_epoch >= es_patience:
            print(f'Stopping after {epoch} epochs')
            break

        if scheduler:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(score)
            else:
                scheduler.step()
        
    model.load_state_dict(torch.load(save_path, weights_only=True))
    model = model.to(device)
    return model, score

In [15]:
train_dataset = JaneStreetMultiStockGraphDatasetLazy(train_ds, adjacency_matrices)
val_dataset = JaneStreetMultiStockGraphDatasetLazy(val_ds, adjacency_matrices)

train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=3)
val_dataloader = DataLoader(val_dataset, batch_size=2048, shuffle=False, num_workers=3)

In [16]:
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [17]:
model = StockGCNModel(
    input_features=79,
    output_dim=1,
    num_layers=1,
    dropout_rate=0.2,
    dim_feedforward_mult=4,
    hidden_dim=64)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.001)
loss_fn = WeightedMSELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=4, min_lr=1e-5)

In [18]:
checkpoint_dir = '/kaggle/working/model'
os.makedirs(checkpoint_dir)

In [19]:
model, _ = train_with_es(model, optimizer, train_dataloader, val_dataloader, 100, 10, loss_fn, True, checkpoint_dir, 10, device, scheduler)
model

Initial weighted r2: -1.876089096069336
Epoch 0 weighted r2: -0.0010010004043579102
Epoch 1 weighted r2: 0.00116652250289917
Epoch 2 weighted r2: 0.002052128314971924
Epoch 3 weighted r2: 0.003565490245819092
Epoch 4 weighted r2: 0.004752993583679199
Epoch 5 weighted r2: 0.005537688732147217
Epoch 6 weighted r2: 0.00625455379486084
Epoch 7 weighted r2: 0.007128000259399414
Epoch 8 weighted r2: 0.008665740489959717
Epoch 9 weighted r2: 0.009343147277832031
Epoch 10 weighted r2: 0.010233759880065918
Epoch 11 weighted r2: 0.010740399360656738
Epoch 12 weighted r2: 0.010953783988952637
Epoch 13 weighted r2: 0.011276841163635254
Epoch 14 weighted r2: 0.011699795722961426
Epoch 15 weighted r2: 0.011717677116394043
Epoch 16 weighted r2: 0.012309730052947998
Epoch 17 weighted r2: 0.012919783592224121
Epoch 18 weighted r2: 0.013102233409881592
Epoch 19 weighted r2: 0.01320505142211914
Epoch 20 weighted r2: 0.013232707977294922
Epoch 21 weighted r2: 0.013968884944915771
Epoch 22 weighted r2: 0.0

StockGCNModel(
  (init_layers): Sequential(
    (0): Dropout(p=0.2, inplace=False)
  )
  (feature_projector): Sequential(
    (0): Linear(in_features=79, out_features=64, bias=True)
    (1): Dropout(p=0.2, inplace=False)
  )
  (encoder): GraphConvEncoder(
    (layers): ModuleList(
      (0): GraphConvEncoderLayer(
        (graph_conv): GCNConv(64, 64)
        (feedforward): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): SiLU()
          (2): Dropout(p=0.2, inplace=False)
          (3): Linear(in_features=256, out_features=64, bias=True)
        )
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (predictor): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): SiLU()
    (2): Dropout(p=0.2, inplace=False)


In [20]:
del train_dataset, val_dataset
gc.collect()

20

In [21]:
train_every = 21
last_n_days_es = 7
old_data_fraction = 0.5
batch_size = 2048
gradient_clipping_decay = 0.5
gradient_clipping = 10 * gradient_clipping_decay
lr_decay = 0.7
lr = 1e-5

In [22]:
offline_model = StockGCNModel(
    input_features=79,
    output_dim=1,
    num_layers=1,
    dropout_rate=0.2,
    dim_feedforward_mult=4,
    hidden_dim=64)
offline_model.load_state_dict(torch.load('/kaggle/working/model/best_model.pth', weights_only=True))
offline_model = offline_model.to(device)

online_learning_dataset = pl.scan_parquet(BASE_PATH / 'partition_id=9' / 'part-0.parquet') \
    .sort(['date_id', 'time_id', 'symbol_id']).fill_nan(0).fill_null(0).collect()
new_dataset = val_ds.collect()
old_dataset = train_ds.collect()

In [23]:
def weighted_r2_score(preds, targets, weights):
    ss_res = (weights * (targets - preds) ** 2).sum()
    ss_tot = (weights * (targets ** 2)).sum()
    return 1 - ss_res / ss_tot if ss_tot > 0 else 0.0

In [24]:
y_hat = []
y_hat_offline = []
y = []
weights = []
daily_r2 = []
daily_r2_offline = []
date_idx = 0
model.eval()

for date_id, test in online_learning_dataset.group_by('date_id', maintain_order=True):
            
    if date_idx % train_every == 0:
        
        model.train()
        max_date = new_dataset.select(pl.col('date_id').max()).item()
        new_validation_dataset = new_dataset.filter(pl.col('date_id') > max_date - last_n_days_es)
        new_training_dataset = new_dataset.filter(pl.col('date_id') <= max_date - last_n_days_es)
        old_data_len = old_data_fraction * new_training_dataset.shape[0] / (1 - old_data_fraction)
        time_factions = min(1, old_data_len / old_dataset.shape[0])
        old_date_times = old_dataset.select(['date_id', 'time_id']).unique().sample(fraction=time_factions)
        old_training_dataset = old_dataset.join(old_date_times, on=['date_id', 'time_id'], how='inner')
        train_dataloader = JaneStreetMultiStockGraphDataset(pl.concat([old_training_dataset, new_training_dataset]), adjacency_matrices)
        val_dataloader = JaneStreetMultiStockGraphDataset(new_validation_dataset, adjacency_matrices)
        train_dataloader = DataLoader(train_dataloader, shuffle=True, batch_size=512, num_workers=3)
        val_dataloader = DataLoader(val_dataloader, shuffle=False, batch_size=2048, num_workers=3)
        
        print(f'Starting fine tuning at date {date_id}')
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.001)
        model, _ = train_with_es(model, optimizer, train_dataloader, val_dataloader, 50, gradient_clipping, loss_fn, True, checkpoint_dir, 3, device, None)
        lr = max(lr * lr_decay, 1e-7)
        gradient_clipping = max(gradient_clipping * gradient_clipping_decay, 0.1)
        model.eval()
                
        old_dataset = old_dataset.vstack(new_training_dataset)
        new_dataset = new_validation_dataset

    date_idx += 1
    test_ = test.fill_null(0).fill_nan(0)      
    new_dataset = test_ if new_dataset is None else new_dataset.vstack(test_)

    predict_df = (
        test_.select(['date_id', 'time_id'])
        .unique()
        .join(pl.DataFrame({'symbol_id': list(range(39))}, 
                           schema={'symbol_id': pl.Int8}), how="cross")
        .join(test_.with_columns(pl.lit(1).alias('mask')), 
              on=['date_id', 'time_id', 'symbol_id'], how="left")
        .fill_null(0)  # fill all columns with 0 for missing stocks (including the mask)
        .sort(['date_id', 'time_id', 'symbol_id'])
    )
    valid_data = predict_df.select(['mask']).to_numpy().flatten() == 1
    x = torch.tensor(predict_df.select([f'feature_{i:02d}' for i in range(79)]).to_numpy().reshape(-1, 39, 79), dtype=torch.float32).to(device)
    s = torch.tensor(predict_df.select(['symbol_id']).to_numpy().flatten().reshape(-1, 39).astype(np.int32)).to(device)
    adj = adjacency_matrices[predict_df.select(pl.col('date_id').first()).item()][np.newaxis, :, :]
    adj = torch.tensor(adj, dtype=torch.int, device=device).repeat(x.shape[0], 1, 1)
    with torch.no_grad():
        preds = model(x, s, adj).cpu().numpy().flatten()
        preds_offline = offline_model(x, s, adj).cpu().numpy().flatten()
    y_hat_offline.append(preds_offline[valid_data])
    y_hat.append(preds[valid_data])
    y.append(predict_df.select(['responder_6']).to_numpy().flatten()[valid_data])
    weights.append(predict_df.select(['weight']).to_numpy().flatten()[valid_data])
    daily_r2.append(weighted_r2_score(y_hat[-1], y[-1], weights[-1]))
    daily_r2_offline.append(weighted_r2_score(y_hat_offline[-1], y[-1], weights[-1]))

score = weighted_r2_score(np.concatenate(y_hat), np.concatenate(y), np.concatenate(weights))
score_offline = weighted_r2_score(np.concatenate(y_hat_offline), np.concatenate(y), np.concatenate(weights))
daily_r2 = np.array(daily_r2)
daily_r2_offline = np.array(daily_r2_offline)
sharpe = np.mean(daily_r2) / np.std(daily_r2)
sharpe_offline = np.mean(daily_r2_offline) / np.std(daily_r2_offline)
stability_index = np.sum(daily_r2 > 0) / daily_r2.shape[0]
stability_index_offline = np.sum(daily_r2_offline > 0) / daily_r2_offline.shape[0]
(score_offline, score), (sharpe_offline, sharpe), (stability_index_offline, stability_index)

Starting fine tuning at date (1530,)
Initial weighted r2: 0.009824395179748535
Epoch 0 weighted r2: 0.013455092906951904
Epoch 1 weighted r2: 0.013493895530700684
Epoch 2 weighted r2: 0.013476252555847168
Epoch 3 weighted r2: 0.013643860816955566
Epoch 4 weighted r2: 0.01370149850845337
Epoch 5 weighted r2: 0.013728022575378418
Epoch 6 weighted r2: 0.014002084732055664
Epoch 7 weighted r2: 0.014057576656341553
Epoch 8 weighted r2: 0.01411139965057373
Epoch 9 weighted r2: 0.014052152633666992
Epoch 10 weighted r2: 0.014257192611694336
Epoch 11 weighted r2: 0.0142289400100708
Epoch 12 weighted r2: 0.014365434646606445
Epoch 13 weighted r2: 0.014370918273925781
Epoch 14 weighted r2: 0.01449728012084961
Epoch 15 weighted r2: 0.014590919017791748
Epoch 16 weighted r2: 0.014451920986175537
Epoch 17 weighted r2: 0.01451963186264038
Epoch 18 weighted r2: 0.01471400260925293
Epoch 19 weighted r2: 0.014532685279846191
Epoch 20 weighted r2: 0.014745175838470459
Epoch 21 weighted r2: 0.01462525129

((0.006262302398681641, 0.009832501411437988),
 (0.6035553669661436, 0.7865270420592765),
 (0.7514792899408284, 0.8698224852071006))