In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Config, PreTrainedModel
from datasets import load_dataset, Dataset, DatasetDict

from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

import math

In [2]:
class GPT2Trader(PreTrainedModel):
    """GPT-2 backbone over continuous 5-feature inputs with four auxiliary heads.

    Each time step's 5 input features are linearly embedded, layer-normalised and fed to a
    `GPT2Model` through `inputs_embeds` (no token vocabulary is used). Four bias-free linear
    heads read the final hidden states:

    * `copy`     -- reconstructs the (detached) input embedding, trained with MSE;
    * `regress`  -- predicts the 3 `future` values, trained with MSE (scaled by 1 / 1e-3);
    * `classify` -- 2-way logits for the sign of the close-price change, trained with CE;
    * `trade`    -- 2-way logits turned into a signed position whose per-step payoff feeds a
                    Sharpe-like statistic.

    Optional config attribute:
        log_losses (bool, default True): when true, `forward` prints the loss components on
            every call (the original behaviour). Set `log_losses=False` on the config to
            silence the output and skip the `.item()` calls it needs.
    """

    # Lets `from_pretrained` build the config from a saved checkpoint's config.json.
    config_class = GPT2Config

    def __init__(self, config):
        """Build the embedding, backbone and heads.

        Note: `config` is modified in place -- `n_layer` and `initializer_range` are
        overwritten from `config.n_embd` before the backbone is created.
        """
        super().__init__(config)

        # use levine 2020 layer numbers: depth is derived from the embedding width
        n_layer = round((math.log(config.n_embd) - 5.039) / 5.55e-2)
        n_layer = max(1, n_layer)
        print(f'Using {n_layer} layers')
        config.n_layer = n_layer

        config.initializer_range = 1 / math.sqrt(config.n_embd)

        # Diagnostic printing in forward(); defaults to on to match earlier runs.
        self.log_losses = getattr(config, "log_losses", True)

        self.embed = nn.Linear(5, config.n_embd, bias = False)
        self.norm = nn.LayerNorm(config.n_embd)
        self.gpt = GPT2Model(config)
        self.copy = nn.Linear(config.n_embd, config.n_embd, bias = False)
        self.regress = nn.Linear(config.n_embd, 3, bias = False)
        self.classify = nn.Linear(config.n_embd, 2, bias = False)
        self.trade = nn.Linear(config.n_embd, 2, bias = False)

        self.MSELoss = nn.MSELoss()
        self.CrossEntropyLoss = nn.CrossEntropyLoss()

        # Constant (+1, -1) weights for the two trade outputs. Registered as a buffer:
        # it is never trained, but still follows the module across devices and is stored
        # in the state dict under the same key ("trade_sign") as before.
        self.register_buffer("trade_sign", torch.Tensor([1, -1]))

    def forward(self, ohlcv, future):
        """Compute the combined training loss.

        Args:
            ohlcv: float tensor whose last dimension has 5 features (it is fed to
                `nn.Linear(5, n_embd)`); feature index 3 is used as the current close.
                Assumes a (batch, seq, 5) layout -- TODO confirm against the dataset.
            future: float tensor whose last dimension has 3 values (regression targets);
                index 2 is used as the future close. Its leading dimensions must match
                `ohlcv` so that the flattened rows line up.

        Returns:
            dict with a single key `"loss"` holding the scalar combined loss.
        """
        embed = self.norm(self.embed(ohlcv))
        hidden = self.gpt(inputs_embeds = embed).last_hidden_state

        # Flatten every head's output to (rows, features) so the losses see one row per step.
        copy = self.copy(hidden).reshape(-1, hidden.shape[-1])
        regression_pred = self.regress(hidden).reshape(-1, 3)
        classification = self.classify(hidden).reshape(-1, 2)
        soft_trade = self.trade(hidden).reshape(-1, 2)

        # information
        original_close = ohlcv.select(index = 3, dim = -1).unsqueeze(-1)
        future_close = future.select(index = 2, dim = -1).unsqueeze(-1)
        # delta > 0 means the future close is BELOW the current close.
        delta = (original_close - future_close).reshape(-1)
        class_sign = (delta >= 0).long()

        # sharpe information
        soft_trade = F.softmax(soft_trade, dim = -1)
        # Softmax rows already sum to 1, so this rescaling is a near no-op; it is kept so
        # the numbers match earlier runs exactly.
        soft_trade = soft_trade / (soft_trade.sum(dim = -1).unsqueeze(1) + 1e-5)
        # Signed position in [-1, 1]: first output weight minus second output weight.
        soft_trade = soft_trade * self.trade_sign
        soft_trade = soft_trade.sum(dim = -1).reshape(-1)
        # Per-step payoff of the position against the realised close change.
        soft_trade = soft_trade * delta

        # The copy target is detached so this loss trains only the backbone and copy head.
        copy_loss = self.MSELoss(copy, embed.reshape(-1, hidden.shape[-1]).detach())
        mse_loss = self.MSELoss(regression_pred, future.reshape(-1, 3)) / 1e-3
        ce_loss = self.CrossEntropyLoss(classification, class_sign)
        # mean / std of the payoffs, scaled by sqrt(number of rows); eps guards std == 0.
        sharpe = soft_trade.mean() * (delta.shape[0]**.5) / (soft_trade.std() + 1e-10)
        # elu(-x) + 1 is strictly positive and decreases as the Sharpe statistic grows.
        sharpe_loss = F.elu(-sharpe) + 1

        loss = sharpe_loss / 3 + ce_loss / 3 + mse_loss / 3 + copy_loss

        if self.log_losses:
            print(f"sharpe {sharpe.item():.2}, ce_loss {ce_loss.item():.4}, mse_loss {mse_loss.item():.4}, copy_loss {copy_loss.item():.4}")
            print(f"preds {regression_pred[0].detach(), future.reshape(-1, 3)[0].detach()}")

        return {"loss": loss}

In [3]:
class TraderTrainer(Trainer):
    """Trainer subclass that assembles the training DataLoader itself."""

    def get_train_dataloader(self) -> DataLoader:
        """Build the :class:`~torch.utils.data.DataLoader` used for training.

        The sampler comes from :obj:`self._get_train_sampler()`; batch size, collator,
        ``drop_last`` and worker count are taken from the trainer and its arguments.

        Raises:
            ValueError: if the trainer was created without a training dataset.
        """
        dataset = self.train_dataset
        if dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        sampler = self._get_train_sampler()

        # NOTE: passing `shuffle=False` here was tried earlier (flagged as important to
        # stop overfitting) but is currently disabled; ordering is decided by the sampler.
        loader_options = dict(
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
        )
        return DataLoader(dataset, **loader_options)

In [4]:
# Row ranges for each split, taken in file order (no shuffling):
# rows [0, 3000) train, [3000, 3200) validation, [3200, end) test.
split_rows = {
    'train': slice(None, 3000),
    'validation': slice(3000, 3200),
    'test': slice(3200, None),
}

eurusd_rows = Dataset.from_parquet('data/EURUSD_day.pq')

# Indexing with a slice returns a dict of columns, which from_dict turns back into a Dataset.
eurusd = DatasetDict({
    split_name: Dataset.from_dict(eurusd_rows[rows])
    for split_name, rows in split_rows.items()
})

Using custom data configuration default-a5e648de2e586aaa
Found cached dataset parquet (C:/Users/micha/.cache/huggingface/datasets/parquet/default-a5e648de2e586aaa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [5]:
def step(self, closure = None):
    """
    Apply one Adam update with decoupled weight decay to every parameter that has a gradient.

    Arguments:
        closure (`Callable`, *optional*): Re-evaluates the model and returns the loss. When
            given it is called once, before any parameter is touched.

    Returns:
        Whatever `closure` returned, or `None` if no closure was supplied.

    Raises:
        RuntimeError: if a parameter's gradient is sparse.
    """
    loss = closure() if closure is not None else None

    for group in self.param_groups:
        for param in group["params"]:
            if param.grad is None:
                continue

            # Plain gradient => minimisation (a negated gradient would maximise instead).
            grad = param.grad.data
            if grad.is_sparse:
                raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")

            state = self.state[param]

            # First visit for this parameter: start the step counter and both running averages.
            if not state:
                state.update(
                    step=0,
                    exp_avg=torch.zeros_like(param.data),
                    exp_avg_sq=torch.zeros_like(param.data),
                )

            first_moment = state["exp_avg"]
            second_moment = state["exp_avg_sq"]
            beta1, beta2 = group["betas"]

            state["step"] += 1

            # Update the running averages of the gradient and its square, in place.
            first_moment.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
            second_moment.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
            denom = second_moment.sqrt().add_(group["eps"])

            lr = group["lr"]
            if group["correct_bias"]:
                # Compensate for the zero-initialised averages during the early steps.
                t = state["step"]
                lr = lr * math.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)

            param.data.addcdiv_(first_moment, denom, value=-lr)

            # Decoupled weight decay: shrink the weights directly, after the Adam update,
            # so the decay never passes through the running averages. Uses the raw group
            # learning rate, not the bias-corrected one.
            decay = group["weight_decay"]
            if decay > 0.0:
                param.data.add_(param.data, alpha=(-group["lr"] * decay))

    return loss

# Monkey-patch: rebinding the class attribute makes every AdamW instance (existing or
# created later) run the module-level `step` defined in this cell instead of the
# library's own method.
# NOTE(review): presumably done so the optimizer the Trainer builds picks this up --
# confirm that the installed transformers version still exports and uses `AdamW`.
from transformers import AdamW
AdamW.step = step

In [6]:
training_args = TrainingArguments(
    # Output location and reporting integrations.
    output_dir="./results",
    report_to="none",
    # Log and evaluate on the same 50-step cadence.
    logging_strategy="steps",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    # Optimisation: single epoch, cosine schedule with 5% warmup, gradient norm clipped at 1.
    num_train_epochs=1,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1,
    # Batch sizes per device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

In [7]:
config = GPT2Config(
    # Width and attention heads; GPT2Trader.__init__ overwrites n_layer from n_embd.
    n_embd=384,
    n_head=6,
    n_positions=2000,
    # No token vocabulary: GPT2Trader feeds the backbone through inputs_embeds.
    vocab_size=0,
    # Dropout is switched off everywhere (0.1 for all three was the earlier setting).
    resid_pdrop=0,
    embd_pdrop=0,
    attn_pdrop=0,
    scale_attn_by_inverse_layer_idx=True,
    use_cache=False,
)

In [8]:
# Place the model on the device the training arguments resolve to (the GPU when one is
# available, otherwise the CPU) instead of calling .cuda() unconditionally, which raises
# on machines without CUDA.
model = GPT2Trader(config).to(training_args.device)

# Trainer wired to the first two EURUSD splits; the test split is not used here.
trainer = TraderTrainer(
    model = model,
    args = training_args,
    train_dataset = eurusd['train'],
    eval_dataset = eurusd['validation']
)

Using 16 layers


In [9]:
# Run training with the arguments configured above. GPT2Trader.forward prints its loss
# components on every call, so this cell produces a large amount of output.
trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1500


sharpe 2.3e+01, ce_loss 0.8422, mse_loss 580.3, copy_loss 1.293
preds (tensor([1.0617, 0.4326, 0.2934], device='cuda:0'), tensor([1.1062, 1.1057, 1.1061], device='cuda:0'))


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss
50,42.17,3.153074
100,17.207,10.8901
150,13.6158,2.110154
200,11.2532,2.98183
250,13.1341,5.295961
300,9.0155,2.010702
350,8.9255,5.800998
400,7.4607,5.096503
450,7.9541,4.426535
500,7.6401,1.911132


sharpe -2.7, ce_loss 0.7619, mse_loss 670.2, copy_loss 1.296
preds (tensor([1.2061, 0.5535, 0.2695], device='cuda:0'), tensor([1.1707, 1.1704, 1.1707], device='cuda:0'))
sharpe -1.2e+01, ce_loss 0.7115, mse_loss 392.9, copy_loss 1.292
preds (tensor([1.2024, 0.5961, 0.3222], device='cuda:0'), tensor([1.0869, 1.0864, 1.0865], device='cuda:0'))
sharpe -3.6, ce_loss 0.7681, mse_loss 451.3, copy_loss 1.29
preds (tensor([0.8862, 0.4900, 0.4494], device='cuda:0'), tensor([1.3611, 1.3599, 1.3600], device='cuda:0'))
sharpe 5.1, ce_loss 0.7317, mse_loss 334.9, copy_loss 1.295
preds (tensor([ 0.6291,  1.1317, -0.1130], device='cuda:0'), tensor([1.2714, 1.2693, 1.2693], device='cuda:0'))
sharpe 7.2, ce_loss 0.8035, mse_loss 72.03, copy_loss 1.28
preds (tensor([0.9786, 0.9038, 0.8101], device='cuda:0'), tensor([1.1840, 1.1833, 1.1837], device='cuda:0'))
sharpe 9.7, ce_loss 0.7994, mse_loss 80.12, copy_loss 1.289
preds (tensor([1.0665, 1.1797, 1.0732], device='cuda:0'), tensor([1.4394, 1.4366, 1.438

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6852, mse_loss 0.6397, copy_loss 0.8642
preds (tensor([1.1847, 1.1705, 1.1501], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.9, ce_loss 0.6995, mse_loss 0.7108, copy_loss 0.8648
preds (tensor([1.1853, 1.1708, 1.1503], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.2, ce_loss 0.681, mse_loss 0.8158, copy_loss 0.867
preds (tensor([1.1881, 1.1720, 1.1508], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.1, ce_loss 0.689, mse_loss 0.875, copy_loss 0.8676
preds (tensor([1.1899, 1.1727, 1.1511], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6801, mse_loss 1.089, copy_loss 0.8631
preds (tensor([1.1879, 1.1719, 1.1508], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6846, mse_loss 1.772, copy_loss 0.8627
preds (tensor([1.1866, 1.1713, 1.1505], device='cuda:0'), tensor([1.2079, 1.206

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6879, mse_loss 30.87, copy_loss 0.2765
preds (tensor([1.3608, 1.1870, 1.4156], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6958, mse_loss 30.17, copy_loss 0.2773
preds (tensor([1.3610, 1.1873, 1.4158], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.3, ce_loss 0.6855, mse_loss 30.55, copy_loss 0.2813
preds (tensor([1.3622, 1.1884, 1.4170], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.1, ce_loss 0.69, mse_loss 29.64, copy_loss 0.2824
preds (tensor([1.3630, 1.1891, 1.4178], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6848, mse_loss 26.38, copy_loss 0.2738
preds (tensor([1.3622, 1.1883, 1.4169], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6874, mse_loss 23.26, copy_loss 0.2732
preds (tensor([1.3616, 1.1878, 1.4164], device='cuda:0'), tensor([1.2079, 1.2069,

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6918, mse_loss 2.576, copy_loss 0.06465
preds (tensor([1.2410, 1.2393, 1.1944], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6935, mse_loss 2.322, copy_loss 0.06554
preds (tensor([1.2413, 1.2395, 1.1945], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.3, ce_loss 0.6915, mse_loss 2.369, copy_loss 0.07052
preds (tensor([1.2425, 1.2401, 1.1948], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.1, ce_loss 0.6923, mse_loss 2.014, copy_loss 0.07182
preds (tensor([1.2432, 1.2405, 1.1950], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6912, mse_loss 1.408, copy_loss 0.06108
preds (tensor([1.2424, 1.2401, 1.1948], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6915, mse_loss 0.8123, copy_loss 0.06023
preds (tensor([1.2418, 1.2398, 1.1946], device='cuda:0'), tensor([1.2079

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.69, mse_loss 6.123, copy_loss 0.0204
preds (tensor([1.2909, 1.2291, 1.2367], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6943, mse_loss 5.716, copy_loss 0.02132
preds (tensor([1.2912, 1.2293, 1.2367], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.4, ce_loss 0.6888, mse_loss 5.767, copy_loss 0.02663
preds (tensor([1.2923, 1.2299, 1.2370], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6912, mse_loss 5.202, copy_loss 0.02804
preds (tensor([1.2931, 1.2303, 1.2372], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6884, mse_loss 4.074, copy_loss 0.01652
preds (tensor([1.2923, 1.2298, 1.2370], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6897, mse_loss 2.793, copy_loss 0.0156
preds (tensor([1.2917, 1.2296, 1.2368], device='cuda:0'), tensor([1.2079, 1.2

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6938, mse_loss 10.95, copy_loss 0.0145
preds (tensor([1.1908, 1.2873, 1.0343], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.693, mse_loss 11.01, copy_loss 0.01549
preds (tensor([1.1910, 1.2875, 1.0344], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6941, mse_loss 11.06, copy_loss 0.021
preds (tensor([1.1919, 1.2881, 1.0349], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6935, mse_loss 11.15, copy_loss 0.02258
preds (tensor([1.1924, 1.2884, 1.0353], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.694, mse_loss 11.52, copy_loss 0.01055
preds (tensor([1.1918, 1.2880, 1.0349], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6935, mse_loss 12.29, copy_loss 0.009611
preds (tensor([1.1914, 1.2877, 1.0346], device='cuda:0'), tensor([1.2079, 1.2

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.7012, mse_loss 1.517, copy_loss 0.01376
preds (tensor([1.2274, 1.2130, 1.1500], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6926, mse_loss 1.437, copy_loss 0.01468
preds (tensor([1.2277, 1.2132, 1.1501], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.7039, mse_loss 1.505, copy_loss 0.01999
preds (tensor([1.2290, 1.2141, 1.1507], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6987, mse_loss 1.382, copy_loss 0.02144
preds (tensor([1.2299, 1.2146, 1.1511], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.7043, mse_loss 1.231, copy_loss 0.009883
preds (tensor([1.2290, 1.2140, 1.1507], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.7011, mse_loss 1.328, copy_loss 0.008965
preds (tensor([1.2283, 1.2136, 1.1504], device='cuda:0'), tensor([1.207

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.7018, mse_loss 14.2, copy_loss 0.01341
preds (tensor([1.3689, 1.2128, 1.2485], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6927, mse_loss 13.74, copy_loss 0.01426
preds (tensor([1.3692, 1.2130, 1.2487], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.7047, mse_loss 13.9, copy_loss 0.0193
preds (tensor([1.3705, 1.2139, 1.2493], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6992, mse_loss 13.09, copy_loss 0.02058
preds (tensor([1.3714, 1.2146, 1.2498], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.7052, mse_loss 11.36, copy_loss 0.009685
preds (tensor([1.3705, 1.2139, 1.2493], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.7017, mse_loss 9.399, copy_loss 0.008807
preds (tensor([1.3698, 1.2134, 1.2490], device='cuda:0'), tensor([1.2079, 

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6889, mse_loss 11.39, copy_loss 0.01269
preds (tensor([1.2844, 1.2781, 1.2915], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.695, mse_loss 10.91, copy_loss 0.01356
preds (tensor([1.2847, 1.2784, 1.2917], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6872, mse_loss 11.1, copy_loss 0.01853
preds (tensor([1.2860, 1.2797, 1.2926], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6905, mse_loss 10.17, copy_loss 0.01987
preds (tensor([1.2868, 1.2805, 1.2932], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6869, mse_loss 8.285, copy_loss 0.00908
preds (tensor([1.2859, 1.2796, 1.2925], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6889, mse_loss 6.099, copy_loss 0.008225
preds (tensor([1.2853, 1.2790, 1.2921], device='cuda:0'), tensor([1.2079, 

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6981, mse_loss 9.73, copy_loss 0.01251
preds (tensor([1.2540, 1.3140, 1.2490], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6927, mse_loss 9.265, copy_loss 0.01328
preds (tensor([1.2543, 1.3143, 1.2492], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6998, mse_loss 9.49, copy_loss 0.01794
preds (tensor([1.2560, 1.3154, 1.2502], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6965, mse_loss 8.682, copy_loss 0.01913
preds (tensor([1.2571, 1.3161, 1.2508], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.7002, mse_loss 7.024, copy_loss 0.009045
preds (tensor([1.2559, 1.3153, 1.2502], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6981, mse_loss 5.161, copy_loss 0.008232
preds (tensor([1.2551, 1.3148, 1.2497], device='cuda:0'), tensor([1.2079,

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6934, mse_loss 1.043, copy_loss 0.01174
preds (tensor([1.1442, 1.2063, 1.2112], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6931, mse_loss 0.9986, copy_loss 0.01254
preds (tensor([1.1446, 1.2067, 1.2114], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.4, ce_loss 0.6935, mse_loss 1.045, copy_loss 0.01714
preds (tensor([1.1464, 1.2083, 1.2124], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6932, mse_loss 1.006, copy_loss 0.0184
preds (tensor([1.1475, 1.2094, 1.2131], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6935, mse_loss 0.9681, copy_loss 0.008387
preds (tensor([1.1463, 1.2082, 1.2124], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6933, mse_loss 1.282, copy_loss 0.007602
preds (tensor([1.1454, 1.2074, 1.2119], device='cuda:0'), tensor([1.20

Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin


sharpe -6.8, ce_loss 0.6933, mse_loss 32.06, copy_loss 0.005506
preds (tensor([1.1947, 1.2537, 1.2413], device='cuda:0'), tensor([1.2584, 1.2574, 1.2584], device='cuda:0'))
sharpe 2.2, ce_loss 0.6932, mse_loss 5.475, copy_loss 0.009024
preds (tensor([1.1614, 1.2364, 1.1929], device='cuda:0'), tensor([1.1220, 1.1208, 1.1220], device='cuda:0'))
sharpe 1.6e+01, ce_loss 0.6934, mse_loss 5.57, copy_loss 0.01191
preds (tensor([1.2305, 1.2754, 1.2034], device='cuda:0'), tensor([1.2800, 1.2793, 1.2796], device='cuda:0'))
sharpe -1.9e+01, ce_loss 0.6934, mse_loss 26.97, copy_loss 0.01389
preds (tensor([1.1644, 1.1444, 1.1363], device='cuda:0'), tensor([1.2990, 1.2965, 1.2965], device='cuda:0'))
sharpe 0.1, ce_loss 0.693, mse_loss 12.71, copy_loss 0.02149
preds (tensor([1.2805, 1.3315, 1.2248], device='cuda:0'), tensor([1.2902, 1.2892, 1.2893], device='cuda:0'))
sharpe 0.75, ce_loss 0.6931, mse_loss 9.927, copy_loss 0.02325
preds (tensor([1.1904, 1.1418, 1.1733], device='cuda:0'), tensor([1.1098

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6941, mse_loss 2.91, copy_loss 0.01097
preds (tensor([1.0912, 1.1904, 1.1430], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.693, mse_loss 3.05, copy_loss 0.01183
preds (tensor([1.0916, 1.1907, 1.1433], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6945, mse_loss 2.951, copy_loss 0.01643
preds (tensor([1.0935, 1.1922, 1.1445], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6937, mse_loss 3.331, copy_loss 0.01782
preds (tensor([1.0947, 1.1932, 1.1452], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6945, mse_loss 4.255, copy_loss 0.007718
preds (tensor([1.0934, 1.1921, 1.1444], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.694, mse_loss 5.754, copy_loss 0.006954
preds (tensor([1.0925, 1.1914, 1.1438], device='cuda:0'), tensor([1.2079, 1

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.689, mse_loss 6.266, copy_loss 0.01091
preds (tensor([1.1210, 1.1172, 1.0698], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6949, mse_loss 6.64, copy_loss 0.0117
preds (tensor([1.1214, 1.1175, 1.0700], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6872, mse_loss 6.482, copy_loss 0.01602
preds (tensor([1.1230, 1.1188, 1.0711], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6906, mse_loss 7.196, copy_loss 0.01725
preds (tensor([1.1241, 1.1196, 1.0718], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6867, mse_loss 8.895, copy_loss 0.007809
preds (tensor([1.1229, 1.1187, 1.0710], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6888, mse_loss 11.4, copy_loss 0.007088
preds (tensor([1.1221, 1.1181, 1.0705], device='cuda:0'), tensor([1.2079, 1

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6865, mse_loss 6.295, copy_loss 0.01058
preds (tensor([1.1944, 1.2663, 1.2839], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6971, mse_loss 5.931, copy_loss 0.01137
preds (tensor([1.1947, 1.2666, 1.2841], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6833, mse_loss 6.062, copy_loss 0.01577
preds (tensor([1.1963, 1.2677, 1.2850], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6894, mse_loss 5.441, copy_loss 0.01706
preds (tensor([1.1973, 1.2685, 1.2856], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6826, mse_loss 4.367, copy_loss 0.007428
preds (tensor([1.1962, 1.2677, 1.2849], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6864, mse_loss 3.146, copy_loss 0.00668
preds (tensor([1.1954, 1.2671, 1.2845], device='cuda:0'), tensor([1.2079

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6899, mse_loss 1.792, copy_loss 0.01016
preds (tensor([1.2111, 1.2224, 1.2279], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6944, mse_loss 1.58, copy_loss 0.01096
preds (tensor([1.2114, 1.2226, 1.2281], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6885, mse_loss 1.671, copy_loss 0.01528
preds (tensor([1.2128, 1.2238, 1.2290], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6911, mse_loss 1.312, copy_loss 0.01661
preds (tensor([1.2136, 1.2246, 1.2296], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6882, mse_loss 0.7471, copy_loss 0.007111
preds (tensor([1.2127, 1.2238, 1.2290], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6898, mse_loss 0.2351, copy_loss 0.006389
preds (tensor([1.2120, 1.2232, 1.2285], device='cuda:0'), tensor([1.20

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.7053, mse_loss 5.711, copy_loss 0.00995
preds (tensor([1.2157, 1.2514, 1.2801], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.8, ce_loss 0.6932, mse_loss 5.365, copy_loss 0.01072
preds (tensor([1.2160, 1.2517, 1.2804], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.7091, mse_loss 5.544, copy_loss 0.01492
preds (tensor([1.2174, 1.2531, 1.2814], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.7018, mse_loss 4.912, copy_loss 0.01618
preds (tensor([1.2183, 1.2539, 1.2820], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.7098, mse_loss 3.745, copy_loss 0.006961
preds (tensor([1.2173, 1.2530, 1.2813], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.7053, mse_loss 2.485, copy_loss 0.006256
preds (tensor([1.2167, 1.2523, 1.2808], device='cuda:0'), tensor([1.207

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6902, mse_loss 4.689, copy_loss 0.00945
preds (tensor([1.2186, 1.2473, 1.2624], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6942, mse_loss 4.422, copy_loss 0.01017
preds (tensor([1.2190, 1.2477, 1.2626], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6891, mse_loss 4.702, copy_loss 0.01409
preds (tensor([1.2208, 1.2494, 1.2639], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6913, mse_loss 4.109, copy_loss 0.01526
preds (tensor([1.2220, 1.2506, 1.2647], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6887, mse_loss 2.895, copy_loss 0.006639
preds (tensor([1.2207, 1.2493, 1.2638], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6901, mse_loss 1.751, copy_loss 0.005985
preds (tensor([1.2198, 1.2485, 1.2632], device='cuda:0'), tensor([1.207

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6959, mse_loss 3.222, copy_loss 0.009002
preds (tensor([1.1678, 1.2574, 1.2282], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6926, mse_loss 3.088, copy_loss 0.009704
preds (tensor([1.1683, 1.2578, 1.2286], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6969, mse_loss 3.363, copy_loss 0.01351
preds (tensor([1.1705, 1.2599, 1.2303], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6949, mse_loss 2.991, copy_loss 0.01462
preds (tensor([1.1719, 1.2612, 1.2314], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.697, mse_loss 2.159, copy_loss 0.006298
preds (tensor([1.1704, 1.2598, 1.2302], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6957, mse_loss 1.623, copy_loss 0.005662
preds (tensor([1.1693, 1.2588, 1.2294], device='cuda:0'), tensor([1.20

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6959, mse_loss 1.067, copy_loss 0.009443
preds (tensor([1.1537, 1.1311, 1.1486], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6927, mse_loss 1.163, copy_loss 0.01017
preds (tensor([1.1543, 1.1316, 1.1490], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6969, mse_loss 1.016, copy_loss 0.01399
preds (tensor([1.1571, 1.1340, 1.1511], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.695, mse_loss 1.4, copy_loss 0.01534
preds (tensor([1.1590, 1.1356, 1.1524], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6971, mse_loss 2.175, copy_loss 0.0067
preds (tensor([1.1570, 1.1339, 1.1510], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6958, mse_loss 3.541, copy_loss 0.006076
preds (tensor([1.1556, 1.1327, 1.1500], device='cuda:0'), tensor([1.2079, 1

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6979, mse_loss 0.3491, copy_loss 0.008846
preds (tensor([1.2039, 1.1716, 1.1644], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6926, mse_loss 0.3488, copy_loss 0.009484
preds (tensor([1.2044, 1.1720, 1.1648], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6996, mse_loss 0.4195, copy_loss 0.01309
preds (tensor([1.2067, 1.1740, 1.1666], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6964, mse_loss 0.4039, copy_loss 0.01417
preds (tensor([1.2083, 1.1753, 1.1679], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6999, mse_loss 0.4668, copy_loss 0.00622
preds (tensor([1.2066, 1.1739, 1.1665], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6979, mse_loss 0.9172, copy_loss 0.005608
preds (tensor([1.2055, 1.1730, 1.1656], device='cuda:0'), tensor

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.6, ce_loss 0.6907, mse_loss 4.433, copy_loss 0.008298
preds (tensor([1.2258, 1.2651, 1.2257], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.694, mse_loss 4.27, copy_loss 0.008964
preds (tensor([1.2264, 1.2656, 1.2262], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6897, mse_loss 4.734, copy_loss 0.01253
preds (tensor([1.2288, 1.2677, 1.2282], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6916, mse_loss 3.996, copy_loss 0.0137
preds (tensor([1.2304, 1.2691, 1.2296], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6895, mse_loss 2.779, copy_loss 0.00575
preds (tensor([1.2287, 1.2676, 1.2281], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6906, mse_loss 1.649, copy_loss 0.005159
preds (tensor([1.2275, 1.2666, 1.2271], device='cuda:0'), tensor([1.2079,

Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin


sharpe -9.6, ce_loss 0.6899, mse_loss 3.348, copy_loss 0.01188
preds (tensor([1.3960, 1.4141, 1.3945], device='cuda:0'), tensor([1.3065, 1.3057, 1.3065], device='cuda:0'))
sharpe 1.0, ce_loss 0.6941, mse_loss 12.08, copy_loss 0.0459
preds (tensor([1.2737, 1.3176, 1.2738], device='cuda:0'), tensor([1.2699, 1.2692, 1.2697], device='cuda:0'))
sharpe -0.55, ce_loss 0.6954, mse_loss 6.207, copy_loss 0.00372
preds (tensor([1.2250, 1.2724, 1.2361], device='cuda:0'), tensor([1.3534, 1.3501, 1.3531], device='cuda:0'))
sharpe 4.5, ce_loss 0.6921, mse_loss 13.32, copy_loss 0.02029
preds (tensor([1.2603, 1.2925, 1.2688], device='cuda:0'), tensor([1.1360, 1.1354, 1.1358], device='cuda:0'))
sharpe 5.3, ce_loss 0.6949, mse_loss 6.029, copy_loss 0.01369
preds (tensor([1.2842, 1.3214, 1.3165], device='cuda:0'), tensor([1.1962, 1.1956, 1.1960], device='cuda:0'))
sharpe 6.5, ce_loss 0.6965, mse_loss 2.195, copy_loss 0.01411
preds (tensor([1.3389, 1.3352, 1.3466], device='cuda:0'), tensor([1.3358, 1.3353,

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6969, mse_loss 0.8448, copy_loss 0.008746
preds (tensor([1.2108, 1.2022, 1.1900], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6926, mse_loss 0.7981, copy_loss 0.009372
preds (tensor([1.2114, 1.2028, 1.1905], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6983, mse_loss 1.081, copy_loss 0.01283
preds (tensor([1.2143, 1.2052, 1.1925], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.2, ce_loss 0.6957, mse_loss 0.7807, copy_loss 0.01405
preds (tensor([1.2162, 1.2068, 1.1939], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6985, mse_loss 0.289, copy_loss 0.006187
preds (tensor([1.2141, 1.2051, 1.1924], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6969, mse_loss 0.1311, copy_loss 0.00561
preds (tensor([1.2127, 1.2039, 1.1914], device='cuda:0'), tensor([

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6908, mse_loss 5.22, copy_loss 0.0111
preds (tensor([1.0935, 1.0989, 1.1004], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.6, ce_loss 0.694, mse_loss 5.154, copy_loss 0.01163
preds (tensor([1.0944, 1.0998, 1.1011], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6898, mse_loss 4.404, copy_loss 0.01482
preds (tensor([1.0987, 1.1036, 1.1043], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.3, ce_loss 0.6916, mse_loss 5.535, copy_loss 0.01647
preds (tensor([1.1014, 1.1061, 1.1064], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6896, mse_loss 7.342, copy_loss 0.008529
preds (tensor([1.0984, 1.1034, 1.1041], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6907, mse_loss 9.892, copy_loss 0.008023
preds (tensor([1.0964, 1.1015, 1.1026], device='cuda:0'), tensor([1.2079, 

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6936, mse_loss 0.1103, copy_loss 0.007534
preds (tensor([1.1606, 1.1649, 1.1673], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.7, ce_loss 0.6931, mse_loss 0.128, copy_loss 0.008036
preds (tensor([1.1614, 1.1657, 1.1680], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.5, ce_loss 0.6938, mse_loss 0.1611, copy_loss 0.01103
preds (tensor([1.1649, 1.1689, 1.1708], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.3, ce_loss 0.6934, mse_loss 0.3138, copy_loss 0.01201
preds (tensor([1.1672, 1.1710, 1.1726], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6937, mse_loss 0.5066, copy_loss 0.005261
preds (tensor([1.1647, 1.1687, 1.1706], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6934, mse_loss 1.27, copy_loss 0.004751
preds (tensor([1.1630, 1.1671, 1.1693], device='cuda:0'), tensor([

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6998, mse_loss 2.256, copy_loss 0.009221
preds (tensor([1.1316, 1.1230, 1.1183], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.6, ce_loss 0.6925, mse_loss 2.276, copy_loss 0.009676
preds (tensor([1.1325, 1.1238, 1.1190], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.6, ce_loss 0.7021, mse_loss 1.866, copy_loss 0.01251
preds (tensor([1.1362, 1.1272, 1.1220], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.3, ce_loss 0.6977, mse_loss 2.599, copy_loss 0.01389
preds (tensor([1.1386, 1.1294, 1.1239], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.7024, mse_loss 3.725, copy_loss 0.006854
preds (tensor([1.1360, 1.1270, 1.1218], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6997, mse_loss 5.558, copy_loss 0.006388
preds (tensor([1.1342, 1.1254, 1.1203], device='cuda:0'), tensor([1.2

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.7, ce_loss 0.6963, mse_loss 0.2542, copy_loss 0.00809
preds (tensor([1.1481, 1.1666, 1.1763], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.6, ce_loss 0.6926, mse_loss 0.2717, copy_loss 0.00856
preds (tensor([1.1489, 1.1674, 1.1770], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.6, ce_loss 0.6976, mse_loss 0.2964, copy_loss 0.0114
preds (tensor([1.1524, 1.1706, 1.1799], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.3, ce_loss 0.6953, mse_loss 0.4618, copy_loss 0.0126
preds (tensor([1.1547, 1.1728, 1.1818], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6977, mse_loss 0.652, copy_loss 0.005797
preds (tensor([1.1522, 1.1705, 1.1797], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6962, mse_loss 1.43, copy_loss 0.005318
preds (tensor([1.1505, 1.1689, 1.1783], device='cuda:0'), tensor([1.20

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.8, ce_loss 0.6933, mse_loss 0.2515, copy_loss 0.008884
preds (tensor([1.1728, 1.1745, 1.1881], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.4, ce_loss 0.6931, mse_loss 0.2828, copy_loss 0.009287
preds (tensor([1.1738, 1.1754, 1.1889], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.7, ce_loss 0.6935, mse_loss 0.6009, copy_loss 0.01198
preds (tensor([1.1780, 1.1793, 1.1922], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.5, ce_loss 0.6932, mse_loss 0.5227, copy_loss 0.0133
preds (tensor([1.1807, 1.1818, 1.1944], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6934, mse_loss 0.1555, copy_loss 0.006559
preds (tensor([1.1777, 1.1791, 1.1920], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6932, mse_loss 0.4616, copy_loss 0.006104
preds (tensor([1.1757, 1.1772, 1.1904], device='cuda:0'), tensor

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.8, ce_loss 0.6934, mse_loss 0.7656, copy_loss 0.009431
preds (tensor([1.1366, 1.1462, 1.1473], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.4, ce_loss 0.6931, mse_loss 0.7572, copy_loss 0.009813
preds (tensor([1.1376, 1.1471, 1.1481], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.7, ce_loss 0.6935, mse_loss 0.5972, copy_loss 0.01245
preds (tensor([1.1420, 1.1512, 1.1516], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.5, ce_loss 0.6933, mse_loss 1.067, copy_loss 0.01389
preds (tensor([1.1449, 1.1539, 1.1540], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6935, mse_loss 1.579, copy_loss 0.007075
preds (tensor([1.1418, 1.1510, 1.1514], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6933, mse_loss 2.846, copy_loss 0.006635
preds (tensor([1.1396, 1.1490, 1.1497], device='cuda:0'), tensor([

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.8, ce_loss 0.6935, mse_loss 0.2163, copy_loss 0.008023
preds (tensor([1.1776, 1.1792, 1.1811], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.4, ce_loss 0.6931, mse_loss 0.2536, copy_loss 0.00842
preds (tensor([1.1785, 1.1801, 1.1818], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.7, ce_loss 0.6936, mse_loss 0.6078, copy_loss 0.01107
preds (tensor([1.1828, 1.1841, 1.1854], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.5, ce_loss 0.6933, mse_loss 0.4824, copy_loss 0.01229
preds (tensor([1.1856, 1.1868, 1.1877], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6936, mse_loss 0.09947, copy_loss 0.005769
preds (tensor([1.1826, 1.1839, 1.1852], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6933, mse_loss 0.3718, copy_loss 0.005318
preds (tensor([1.1805, 1.1820, 1.1835], device='cuda:0'), tenso

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.8, ce_loss 0.6936, mse_loss 0.1496, copy_loss 0.008007
preds (tensor([1.1709, 1.1757, 1.1795], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.4, ce_loss 0.6931, mse_loss 0.1793, copy_loss 0.008407
preds (tensor([1.1718, 1.1766, 1.1803], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.7, ce_loss 0.6938, mse_loss 0.4638, copy_loss 0.01106
preds (tensor([1.1761, 1.1806, 1.1839], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.5, ce_loss 0.6934, mse_loss 0.4057, copy_loss 0.01227
preds (tensor([1.1789, 1.1832, 1.1862], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6938, mse_loss 0.1412, copy_loss 0.005756
preds (tensor([1.1759, 1.1804, 1.1837], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6935, mse_loss 0.5288, copy_loss 0.005307
preds (tensor([1.1738, 1.1785, 1.1820], device='cuda:0'), tenso

***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


sharpe -9.8, ce_loss 0.6936, mse_loss 0.1461, copy_loss 0.008
preds (tensor([1.1715, 1.1758, 1.1789], device='cuda:0'), tensor([1.1782, 1.1778, 1.1779], device='cuda:0'))
sharpe -1.4, ce_loss 0.6931, mse_loss 0.1759, copy_loss 0.0084
preds (tensor([1.1725, 1.1767, 1.1797], device='cuda:0'), tensor([1.1867, 1.1859, 1.1862], device='cuda:0'))
sharpe -2.7, ce_loss 0.6938, mse_loss 0.4607, copy_loss 0.01105
preds (tensor([1.1767, 1.1807, 1.1832], device='cuda:0'), tensor([1.1876, 1.1872, 1.1874], device='cuda:0'))
sharpe -7.5, ce_loss 0.6934, mse_loss 0.4014, copy_loss 0.01227
preds (tensor([1.1795, 1.1833, 1.1855], device='cuda:0'), tensor([1.1905, 1.1896, 1.1905], device='cuda:0'))
sharpe -1.1e+01, ce_loss 0.6938, mse_loss 0.1382, copy_loss 0.005751
preds (tensor([1.1765, 1.1805, 1.1830], device='cuda:0'), tensor([1.1973, 1.1966, 1.1970], device='cuda:0'))
sharpe -7.1, ce_loss 0.6935, mse_loss 0.5254, copy_loss 0.005302
preds (tensor([1.1744, 1.1785, 1.1813], device='cuda:0'), tensor([1.

Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1500, training_loss=9.067436218261719, metrics={'train_runtime': 1278.2607, 'train_samples_per_second': 2.347, 'train_steps_per_second': 1.173, 'total_flos': 0.0, 'train_loss': 9.067436218261719, 'epoch': 1.0})

In [10]:
# Drop the notebook-level names `trainer` and `model`, then release the CUDA
# memory that PyTorch's caching allocator is holding but no longer using.
del trainer, model
torch.cuda.empty_cache()

# Appendix

## Quick timing check: forward pass on GPU vs. CPU

In [None]:
# Build a fresh GPT2Trader from `config` and move it to the GPU; the
# %%timeit cell that follows measures this instance.
model = (
    GPT2Trader(config)
    .cuda()
)

In [72]:
%%timeit
fake_data = torch.randn(4, 391, 256)
fake_data = fake_data.cuda()
model(fake_data)
cpu = fake_data.cpu()

28.2 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# Rebuild the model on the CPU so the next %%timeit cell measures the
# CPU forward pass.
model = (
    GPT2Trader(config)
    .cpu()
)

Using 9 layers


In [74]:
%%timeit
fake_data = torch.randn(4, 391, 256)
model(fake_data)

748 ms ± 82.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
