In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Config, PreTrainedModel
from datasets import load_dataset, Dataset, DatasetDict

from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

import math
import numpy as np

# Competitive edge techniques:
- Custom loss function based on profit with trade sizing (more of a conceptual innovation)
- Multi-loss that integrates shorter and longer trades (a practical innovation: unlocks longer horizons, more data, and better gradients)
- Best-in-class transformer -- most people don't really understand how transformers work
    - hopefully this yields more data-efficient models
- Right now the model is quite greedy (for my own sanity), but it's entirely possible to consider a model with <50% accuracy and a higher upside instead

TODO:
- compute-metrics functions
    - still need, say, full trade %, accuracy, and profitability reported separately for trades under 30 min and for trades of 30+ min
    - try a learning rate of 5e-4
- Try SRU++

- Longer timeframe, e.g. up to 120 min?
- Data analysis of when/where the model trades (which time frame, and what time of day)
    - should I do a graphical analysis?
    - what about the raw accuracy of big trades?
- Trickier goal: add USD/JPY and USD/GBP. Should they be integrated all at once, though? (Handling multiple data streams would be tricky.) May leave this until after paper trading is set up
    - it would be fine to use them just as a transfer-learning tool, though

In [11]:
class GPT2Trader(PreTrainedModel):
    """GPT-2 backbone that turns a sequence of price bars into soft trade positions.

    Each time step is linearly projected to ``config.n_embd``, layer-normalised and
    fed to a ``GPT2Model`` through ``inputs_embeds``. A bias-free linear head then emits
    ``N_TRADES`` values per step, squashed with ``tanh`` into positions in ``[-1, 1]``.

    Note: ``__init__`` overwrites ``config.n_layer`` and ``config.initializer_range``
    on the config object it is given.
    """

    # Width of one input bar (the argument is named `ohlcv`, presumably
    # open/high/low/close/volume) and number of trade outputs per time step.
    N_FEATURES = 5
    N_TRADES = 120

    def __init__(self, config):
        super().__init__(config)

        # use levine 2020 layer numbers: depth is derived from the width
        n_layer = round((math.log(config.n_embd) - 5.039) / 5.55e-2)
        n_layer = max(1, n_layer)
        print(f'Using {n_layer} layers')
        config.n_layer = n_layer

        config.initializer_range = 1 / math.sqrt(config.n_embd)

        self.embed = nn.Linear(self.N_FEATURES, config.n_embd, bias = False)
        self.norm = nn.LayerNorm(config.n_embd)
        self.gpt = GPT2Model(config)
        self.trade = nn.Linear(config.n_embd, self.N_TRADES, bias = False)


    def forward(self, ohlcv, future = None):
        """Compute soft positions and, when targets are given, profit and loss.

        Args:
            ohlcv: float tensor whose last dimension is ``N_FEATURES``
                (assumed ``(batch, seq, N_FEATURES)`` -- TODO confirm against the dataset).
            future: optional tensor multiplied elementwise with the positions, so it
                must broadcast against ``(..., N_TRADES)``. Presumably normalised
                future returns per output -- verify against the dataset. When omitted,
                only positions are returned (inference mode).

        Returns:
            dict with ``'trades'`` (positions in ``[-1, 1]``) and, when ``future`` is
            given, also ``'loss'`` (scalar) and ``'profits'`` (``trades * future``).
        """
        embed = self.norm(self.embed(ohlcv))
        hidden = self.gpt(inputs_embeds = embed).last_hidden_state

        # tanh bounds every position to [-1, 1] (full short .. full long)
        soft_trade = torch.tanh(self.trade(hidden))

        if future is None:
            # no targets: nothing to score, just hand back the positions
            return {'trades': soft_trade}

        soft_profit = soft_trade * future

        # |future| is the profit of a perfectly sized and signed trade, so
        # (|future| - profit) is a non-negative shortfall; squaring it drives the
        # loss toward 0 and weights large misses more heavily than small ones
        loss_ppl = torch.square(future.abs() - soft_profit).mean()

        # penalty for big trades (to stop trading from happening with no profit)
        trade_penalty = soft_trade.abs().mean()

        # author's note: .1 means that a 100% position must make at least .1 of a std to offset loss
        loss = loss_ppl + .1 * trade_penalty

        return {
            'loss': loss,
            'profits': soft_profit,
            'trades': soft_trade,
        }

In [43]:
def compute_metrics(preds):
    """Summarise evaluation predictions into trading metrics.

    Args:
        preds: object whose ``predictions`` attribute unpacks into
            ``(soft_profit, soft_trade)``, two equally shaped 3-D numpy arrays
            (the first axis is summed over as one "day" per row).

    Returns:
        dict mapping metric name to a string formatted with 2 decimals of precision.
        Metrics that are undefined for the given data are reported as ``'nan'``:
        an accuracy when no trade exceeds its threshold, and the day Sharpe ratio
        when the per-day profits have zero spread (e.g. a single day).
    """
    soft_profit, soft_trade = preds.predictions
    abs_trade = np.abs(soft_trade)

    def _accuracy(threshold):
        # share of profitable trades among positions larger than `threshold`
        selected = soft_profit[abs_trade > threshold]
        if selected.size == 0:
            # no such trades: accuracy is undefined; avoid numpy's empty-mean warning
            return float('nan')
        return (selected > 0).mean() * 100

    day_profits = soft_profit.sum(axis = (1, 2))
    day_std = day_profits.std()
    # guard the division: zero spread makes the ratio undefined
    day_sharpe = day_profits.mean() / day_std if day_std > 0 else float('nan')

    metrics = {
        'day sharpe': day_sharpe,
        'trade %': abs_trade.sum() * 100 / soft_profit.size,
        'full trade %': (abs_trade > .9).mean() * 100,
        'full trade accuracy': _accuracy(.9),
        'medium trade %': (abs_trade > .5).mean() * 100,
        'medium trade accuracy': _accuracy(.5),
    }

    # round the metrics
    metrics = {k: np.format_float_positional(v, precision = 2) for k, v in metrics.items()}

    return metrics

In [44]:
# Load the saved dataset, then carve the trailing 5% into equal validation/test
# halves. shuffle=False on both splits so the original row order is preserved.
eurusd = Dataset.load_from_disk('data/EURUSD_day.ds')

split = eurusd.train_test_split(test_size = .05, shuffle = False)
valid_test = split['test'].train_test_split(test_size = .5, shuffle = False)

eurusd = DatasetDict(
    train = split['train'],
    validation = valid_test['train'],
    test = valid_test['test'],
)

In [45]:
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    # log and evaluate on the same 100-step cadence
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    # optimisation schedule: single epoch, cosine decay after a 5% warmup
    num_train_epochs=1,
    learning_rate=1e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=.05,
    max_grad_norm=1,
    # one example per step for both training and evaluation
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

PyTorch: setting up devices


In [46]:
config = GPT2Config(
    # model size; n_layer is filled in later by GPT2Trader.__init__
    n_embd=384,
    n_head=6,
    n_positions=2000,
    vocab_size=0,
    # low dropout since only using 1 epoch training and to make model more robust
    # to data issues (.1 has worse loss, accuracy & t-score)
    resid_pdrop=.01,
    embd_pdrop=.01,
    attn_pdrop=.01,
    summary_first_dropout=0,
    summary_proj_to_labels=False,
    scale_attn_by_inverse_layer_idx=True,
    use_cache=False,
)

In [47]:
# Build the model on the GPU and wire it to the train/validation splits.
model = GPT2Trader(config)
model = model.cuda()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=eurusd['train'],
    eval_dataset=eurusd['validation'],
    compute_metrics=compute_metrics,
)


Using 16 layers


In [48]:
# shuffled training: runs the stock Trainer built above, not the TraderTrainer
# variant kept in the appendix
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Medium trade %,Medium trade accuracy
100,0.7428,0.59886,1.4,53.69,0.34,52.07,58.1,52.67
200,1.0593,0.629185,1.19,35.43,0.05,50.63,19.08,53.12
300,0.6187,0.574223,1.69,58.2,0.1,55.6,75.46,51.46
400,0.7249,0.498308,1.4,30.9,0.0,50.0,29.13,53.94
500,0.643,0.475523,1.06,39.59,0.14,46.61,24.92,55.38
600,0.5045,0.475192,1.7,38.56,0.06,56.23,36.72,54.34
700,0.7065,0.509966,1.87,37.81,0.0,,42.78,52.66
800,0.5846,0.46077,1.35,21.82,0.88,50.54,17.3,56.59
900,0.5729,0.467708,1.16,24.87,0.0,100.0,18.21,57.44
1000,0.735,0.469905,1.38,25.93,4.13,63.69,17.26,58.26


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
  'full trade accuracy': (soft_profit[abs_trade > .9] > 0).mean() * 100,
  ret = ret.dtype.type(ret / rcount)
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./res

TrainOutput(global_step=3270, training_loss=0.5552892037487905, metrics={'train_runtime': 1886.12, 'train_samples_per_second': 1.734, 'train_steps_per_second': 1.734, 'total_flos': 0.0, 'train_loss': 0.5552892037487905, 'epoch': 1.0})

In [None]:
# trainer.evaluate(eurusd['test'])

In [None]:
# Drop the references to the trainer and model, then ask PyTorch to release
# its cached GPU memory.
del trainer, model
torch.cuda.empty_cache()

# Appendix

## failed idea: have the model go through a timewise curriculum of the data
The idea was that if training didn't respect the time-series nature of the data, the model could "memorize" later parts of the data and use them to predict earlier data better (which wouldn't help at test time). This doesn't seem to be an issue: with shuffled training, the model performs better on a validation set that comes entirely from the future.

In [3]:
class TraderTrainer(Trainer):
    """Trainer subclass from the abandoned time-ordered training experiment."""

    def get_train_dataloader(self) -> DataLoader:
        """
        Build the :class:`~torch.utils.data.DataLoader` used for training.

        Batch size, ``drop_last`` and worker count are read from ``self.args``; the
        sampler is whatever ``self._get_train_sampler()`` returns.

        Raises:
            ValueError: if ``self.train_dataset`` is ``None``.

        NOTE(review): the original carried a commented-out ``shuffle=False``
        ("TO STOP OVERFITTING"), which suggests chronological order was intended.
        As written, iteration order is still decided by the sampler -- confirm
        whether a sequential sampler was meant.
        """
        dataset = self.train_dataset
        if dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        sampler = self._get_train_sampler()
        args = self.args

        return DataLoader(
            dataset,
            batch_size=args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=args.dataloader_drop_last,
            num_workers=args.dataloader_num_workers,
        )

## make sure unshuffled split maintains order

In [16]:
# Toy dataset 0..99 to check that shuffle=False keeps rows in their original order.
# The temporaries get their own names so this check does not overwrite the `split` /
# `valid_test` variables produced by the real EURUSD split.
foo_raw = Dataset.from_dict({"input": list(range(100))})
foo_split = foo_raw.train_test_split(.1, shuffle = False)
foo_valid_test = foo_split['test'].train_test_split(.5, shuffle = False)
foo = DatasetDict({
    'train': foo_split['train'],
    'validation': foo_valid_test['train'],
    'test': foo_valid_test['test']
})

# Fail loudly if ordering is ever not preserved, instead of relying on eyeballing.
assert foo['train']['input'] == list(range(90))
assert foo['validation']['input'] == list(range(90, 95))
assert foo['test']['input'] == list(range(95, 100))

In [20]:
# show both held-out parts; they should be contiguous and in ascending order
foo['validation']['input'], foo['test']['input']

([90, 91, 92, 93, 94], [95, 96, 97, 98, 99])

## quick timing check

In [None]:
# fresh (untrained) model on the GPU, used only for the timing cell below
model = GPT2Trader(config).cuda()

In [72]:
%%timeit
fake_data = torch.randn(4, 391, 256)
fake_data = fake_data.cuda()
model(fake_data)
cpu = fake_data.cpu()

28.2 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# same architecture on the CPU, for comparison with the GPU timing above
model = GPT2Trader(config).cpu()

Using 9 layers


In [74]:
%%timeit
fake_data = torch.randn(4, 391, 256)
model(fake_data)

748 ms ± 82.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
