In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Config, PreTrainedModel
from sru import SRUpp
from datasets import load_dataset, Dataset, DatasetDict

from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

import math
import numpy as np

import logging
logging.disable(logging.INFO)

# MAIN GOAL: GET SET UP WITH PAPER TRADING AND POLYGON
First, manually pull Polygon data for a recent day (e.g. yesterday), format it for the model, and get some predictions (in parallel at first, then try it sequentially)
- need an inference optional parameter that'll return early

# Competitive edge techniques:
- Custom loss function based on profit with trade sizing (more of a conceptual innovation)
- multiloss to integrate shorter and longer trades (practical innovation to unlock longer times and more data and better gradients)
- best models -- most people don't really understand how transformers even work let alone sru
- the way I normalize the data in preprocessing may be better?

- right now the model is quite greedy (for my own sanity), but it's entirely possible to consider a model with <50% accuracy that simply has a higher upside
- the linear loss is odd, to say the least; in some sense it has a strong bias toward "opportunity cost"

TODO:
- trickier goal: add usd/jpy and usd/gbp. Should they all be integrated at once, though? (Handling multiple data streams would be tricky.) May leave this until after paper trading is set up
    - it would be fine to use them simply as a transfer-learning tool, though
    - could simply harvest their data by concatenating it to the front of the dataset (the validation set would still be eurusd)

In [2]:
class SRUTrader(PreTrainedModel):
    """SRU++ network that turns a sequence of 5-feature bars into 120 soft trade sizes per step.

    Pipeline: linear embedding of the 5 input features + learned position
    embeddings -> LayerNorm -> 10-layer SRU++ -> linear head with 120 outputs
    squashed by tanh, so every trade size lies in [-1, 1].
    """

    def __init__(self, config):
        """Build the layers from `config` (reads `n_embd`, `n_positions` and `n_head`)."""
        super().__init__(config)

        # NOTE(review): nothing visible in this class reads initializer_range back;
        # it is only stored on the config here.
        config.initializer_range = 1 / math.sqrt(config.n_embd)

        self.embed = nn.Linear(5, config.n_embd, bias = False)
        self.position_embeddings = nn.Embedding(config.n_positions, config.n_embd)
        self.norm = nn.LayerNorm(config.n_embd)
        self.sru = SRUpp(input_size = config.n_embd,
                         hidden_size = config.n_embd,
                         proj_size = 4 * config.n_embd, # paper says 8 is better, but working with memory constraints
                         num_layers = 10, # paper seemed to have tuned to find this to work
                         dropout = .01,
                         attn_dropout = .01,
                         rescale = True,
                         layer_norm = True,
                         num_heads = config.n_head,
                         attention_every_n_layers = 2)
        self.trade = nn.Linear(config.n_embd, 120, bias = False)


    def forward(self, ohlcv, future = None):
        """Compute soft trades and, when targets are given, the training loss.

        Args:
            ohlcv: float tensor of shape (batch, seq, 5). `seq` must not exceed
                `config.n_positions`, the size of the position-embedding table.
            future: optional tensor that multiplies element-wise with the
                (batch, seq, 120) trades, i.e. the outcome each trade is scored
                against (presumably normalised future returns - confirm against
                the preprocessing). Leave as None for inference.

        Returns:
            dict with 'trades' only when `future` is None; otherwise
            'loss', 'profits' (trades * future) and 'trades'.
        """
        # manual positional embeddings
        batch_size, seq_length, _ = ohlcv.shape
        position_ids = torch.arange(seq_length, dtype=torch.long, device=ohlcv.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        position_embeddings = self.position_embeddings(position_ids)

        embed = self.norm(self.embed(ohlcv) + position_embeddings)
        embed = torch.permute(embed, (1, 0, 2)) # sequence first for SRU
        hidden = torch.permute(self.sru(embed)[0], (1, 0, 2))

        # tanh bounds every position to [-1, 1] (full short .. full long)
        soft_trade = torch.tanh(self.trade(hidden))

        # inference path: no targets, so there is nothing to score - return early
        if future is None:
            return {
                'trades': soft_trade,
            }

        soft_profit = soft_trade * future

        # squared shortfall against the best achievable profit |future|: always >= 0 and
        # zero only for a full-size trade in the right direction; because of the square,
        # a losing trade is penalised more than an equally sized winning trade is rewarded
        loss_ppl = torch.square(((-soft_profit + future.abs()))).mean()

        # penalty for big trades (to stop trading from happening with no profit)
        trade_penalty = soft_trade.abs().mean()

        loss = loss_ppl + .1 * trade_penalty # .1 means that a 100% position must make at least .1 of a std to offset loss

        return {
            'loss': loss,
            'profits': soft_profit,
            'trades': soft_trade,
        }

In [3]:
def _mean_or_nan(values):
    """Mean of `values`, or nan (without NumPy's empty-slice RuntimeWarning) when it is empty."""
    if values.size == 0:
        return np.float64('nan')
    return values.mean()


def _bucket_metrics(label, in_bucket, soft_profit):
    """Share, hit rate and gain/loss ratio of the positions selected by the boolean mask `in_bucket`."""
    profits = soft_profit[in_bucket]
    avg_gain = _mean_or_nan(profits[profits > 0])
    avg_loss = _mean_or_nan(profits[profits < 0])
    return {
        f'{label} trade %': _mean_or_nan(in_bucket) * 100,
        f'{label} trade accuracy': _mean_or_nan(profits > 0) * 100,
        f'{label} trade g/l': avg_gain / -avg_loss,
    }


def compute_metrics(preds):
    """Summarise predicted trades and their profits for the Trainer's evaluation log.

    Args:
        preds: object whose `.predictions` unpacks into `(soft_profit, soft_trade)`,
            two equally shaped arrays with at least 3 axes (axes 1 and 2 are summed
            to get one profit per example).

    Returns:
        dict mapping metric name to a string formatted with at most 2 decimals:
        'day sharpe' (mean / std of per-example profit), 'trade %' (mean absolute
        position size), and share / accuracy / gain-loss ratio for "full"
        (|trade| >= .9) and "medium" (.5 < |trade| < .9) positions.
        A metric that is undefined because its bucket is empty is reported as
        'nan' without emitting NumPy runtime warnings.
    """
    soft_profit, soft_trade = preds.predictions
    abs_trade = np.abs(soft_trade)

    day_profits = soft_profit.sum(axis = (1, 2))

    # build each bucket mask once instead of re-evaluating it for every metric
    full = abs_trade >= .9
    medium = (abs_trade < .9) & (abs_trade > .5)

    # a zero std or an empty bucket still yields inf / nan, just without the warning noise
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        metrics = {
            'day sharpe': day_profits.mean() / day_profits.std(),
            'trade %': abs_trade.sum() * 100 / soft_profit.size,
            **_bucket_metrics('full', full, soft_profit),
            **_bucket_metrics('medium', medium, soft_profit),
        }

    # round the metrics
    metrics = {k: np.format_float_positional(v, precision = 2) for k, v in metrics.items()}

    return metrics

In [4]:
# Load the saved EURUSD dataset from disk.
eurusd = Dataset.load_from_disk('data/EURUSD_day.ds')

# Order-preserving splits (shuffle = False): the last 5% of rows are held out,
# then halved into validation (first half) and test (second half).
train_holdout = eurusd.train_test_split(test_size = .05, shuffle = False)
holdout_halves = train_holdout['test'].train_test_split(test_size = .5, shuffle = False)

eurusd = DatasetDict({
    'train': train_holdout['train'],
    'validation': holdout_halves['train'],
    'test': holdout_halves['test'],
})

In [5]:
training_args = TrainingArguments(
    # checkpoints go to ./results; no external experiment tracker
    output_dir = "./results",
    report_to = "none",

    # log and evaluate on the same 100-step cadence
    logging_strategy = "steps",
    logging_steps = 100,
    evaluation_strategy = "steps",
    eval_steps = 100,

    # a single pass over the data, one example per step
    num_train_epochs = 1,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,

    # optimisation: cosine schedule with 5% warmup, gradients clipped to norm 1
    learning_rate = 1e-4,
    lr_scheduler_type = "cosine",
    warmup_ratio = .05,
    max_grad_norm = 1,
)

In [6]:
# Only these three fields are read by SRUTrader; everything else keeps the GPT2Config default.
config = GPT2Config(
    n_embd = 320,        # embedding / hidden width
    n_head = 5,          # attention heads passed to SRUpp
    n_positions = 2000,  # rows in the position-embedding table (max sequence length)
)

In [7]:
# Fresh, untrained SRUTrader wired into a Trainer: fit on the train split and
# score the validation split with compute_metrics at every evaluation.
model = SRUTrader(config)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=eurusd['train'],
    eval_dataset=eurusd['validation'],
)

In [8]:
# sru lr of 1e-4, hidden size 320, 5 heads
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7984,0.573184,1.13,37.88,1.85,55.38,2.89,29.91,51.87,1.58
200,1.0569,0.514235,1.17,35.91,6.5,59.73,3.4,20.68,50.04,1.36
300,0.4328,0.44122,1.39,37.12,5.2,62.28,3.23,24.46,60.56,1.88
400,0.5265,0.341141,1.35,35.65,4.89,69.58,5.05,22.06,65.1,1.97
500,0.4726,0.325495,1.39,34.75,3.47,75.81,6.44,23.08,65.96,2.39
600,0.3262,0.312801,1.74,45.38,8.63,75.6,4.43,33.81,65.92,1.76
700,0.4463,0.310042,1.77,46.73,11.82,73.21,3.45,31.97,66.44,1.73
800,0.3787,0.323889,1.66,36.43,4.45,85.65,6.66,25.77,73.08,2.24
900,0.3346,0.25018,1.95,43.6,10.61,84.86,4.8,29.85,70.54,2.0
1000,0.3706,0.240688,1.92,42.35,10.56,83.97,4.88,27.72,72.53,2.06


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.2732733577763269, metrics={'train_runtime': 1551.6159, 'train_samples_per_second': 2.107, 'train_steps_per_second': 2.107, 'total_flos': 0.0, 'train_loss': 0.2732733577763269, 'epoch': 1.0})

In [9]:
trainer.evaluate(eurusd['test'])

***** Running Evaluation *****
  Num examples = 87
  Batch size = 1


{'eval_loss': 0.11858043819665909,
 'eval_day sharpe': '2.67',
 'eval_trade %': '52.93',
 'eval_full trade %': '19.35',
 'eval_full trade accuracy': '95.11',
 'eval_full trade g/l': '6.63',
 'eval_medium trade %': '32.86',
 'eval_medium trade accuracy': '89.90',
 'eval_medium trade g/l': '2.68',
 'eval_runtime': 17.1238,
 'eval_samples_per_second': 5.081,
 'eval_steps_per_second': 5.081,
 'epoch': 1.0}

In [13]:
# Run the trained model over the validation split and keep the raw per-position
# profits and trade sizes for the breakdowns in the following cells.
validation_output = trainer.predict(eurusd['validation'])
soft_profit, soft_trade = validation_output.predictions

***** Running Prediction *****
  Num examples = 86
  Batch size = 1


In [45]:
# Full trade percent on 24 hours: share of positions above .9, averaged over axes 0 and 2
# for every step and then over consecutive blocks of 60 steps.
# Ignoring the last hour, it makes most trades in the london and ny sessions (esp. their overlap).
# NOTE(review): `soft_trade > .9` selects only positive positions, whereas compute_metrics
# defines a full trade as abs(soft_trade) >= .9 - confirm that leaving out shorts is intended.
is_full_long = soft_trade > .9
full_share_per_step = is_full_long.mean(axis = (0, 2))
full_share_per_step.reshape(-1, 60).mean(axis = 1)

array([0.00249839, 0.01800549, 0.03796673, 0.07796512, 0.03417797,
       0.01741602, 0.02248062, 0.00574774, 0.04024386, 0.10422804,
       0.11214309, 0.12038921, 0.12152778, 0.11953973, 0.11725129,
       0.14244348, 0.14758236, 0.17693637, 0.14784884, 0.0772626 ,
       0.04732397, 0.04191053, 0.08761789, 0.26417474])

In [46]:
# Full trade accuracy on 24 hours: among positions at or above .9, the fraction with
# positive profit per step (nan where a step has no such position), averaged over
# consecutive blocks of 60 steps. A block containing any nan step comes out as nan.
# NOTE(review): only positive positions are selected here (no abs) - confirm that is intended.
full_long_hit = np.where(soft_trade >= .9, soft_profit > 0, np.nan)
accuracy_per_step = np.nanmean(full_long_hit, axis = (0, 2))
accuracy_per_step.reshape(-1, 60).mean(axis = 1)

  np.nanmean(np.where(soft_trade >= .9, soft_profit > 0, np.nan), axis = (0, 2)).reshape(-1, 60).mean(axis = 1)


array([       nan, 0.98071417, 0.95146088, 0.99030557, 0.99806648,
       0.98027491, 0.97749938,        nan, 0.99290491, 0.99647212,
       0.99536351, 0.99196472, 0.99540797, 0.99237283, 0.99370732,
       0.99259902, 0.99310819, 0.99533136, 0.99444364, 0.99727577,
       0.99690271, 0.97558482, 0.76396494, 0.81481913])

In [16]:
# Percent full trades on all timeframes: share of positions above .9 for each index
# of the last axis, averaged over axes 0 and 1.
# NOTE(review): positive positions only (no abs), and strict `>` where compute_metrics uses `>=`.
is_full_long_tf = soft_trade > .9
is_full_long_tf.mean(axis = (0, 1))

array([0.00062984, 0.00098514, 0.00163114, 0.00394864, 0.00737242,
       0.01115149, 0.01350937, 0.01660207, 0.01845123, 0.02241602,
       0.02708333, 0.02858527, 0.03628068, 0.03681363, 0.03950258,
       0.04244186, 0.04646318, 0.05859981, 0.04962855, 0.04767442,
       0.05016957, 0.06216085, 0.05637112, 0.0620155 , 0.06960594,
       0.05977875, 0.07156008, 0.06825743, 0.07321544, 0.07039729,
       0.07865795, 0.07989341, 0.07865795, 0.07973191, 0.08538437,
       0.08732235, 0.08049903, 0.08968831, 0.08921996, 0.0912064 ,
       0.08502907, 0.0845365 , 0.09761789, 0.09752907, 0.08401163,
       0.09277293, 0.09887758, 0.0970365 , 0.10746932, 0.10379522,
       0.09903101, 0.09917636, 0.11875807, 0.09421835, 0.10753391,
       0.10094477, 0.09886143, 0.11103036, 0.10372255, 0.10862403,
       0.09933786, 0.08993863, 0.10190568, 0.1123385 , 0.11434916,
       0.09837694, 0.11026324, 0.09916828, 0.09843346, 0.10160691,
       0.10784884, 0.09940245, 0.10044412, 0.10874516, 0.10713

In [31]:
# Full trade accuracy on all timeframes: among positions above .9, the fraction with
# strictly positive profit for each index of the last axis.
# NOTE(review): positive positions only (no abs) - confirm that is intended.
full_long_win = np.where(soft_trade > .9, soft_profit > 0, np.nan)
np.nanmean(full_long_win, axis = (0, 1))

array([0.57692308, 0.79508197, 0.74752475, 0.78936605, 0.83461117,
       0.79942071, 0.8475792 , 0.89396887, 0.89234136, 0.90706052,
       0.91234347, 0.91497175, 0.92343646, 0.93200263, 0.94582993,
       0.94387367, 0.94160584, 0.94708557, 0.94093719, 0.95240515,
       0.95364558, 0.95336451, 0.95745595, 0.95755208, 0.96183295,
       0.96528434, 0.96513202, 0.95646516, 0.95489136, 0.96295022,
       0.96294015, 0.96371538, 0.96427472, 0.96283168, 0.95867221,
       0.96467542, 0.96509178, 0.96668767, 0.96578876, 0.96644533,
       0.96980057, 0.96609036, 0.96947638, 0.96944858, 0.96837755,
       0.96744712, 0.97035525, 0.9703753 , 0.96934405, 0.97067061,
       0.96738421, 0.96604787, 0.96940233, 0.96588961, 0.96568296,
       0.96536277, 0.9664298 , 0.96465455, 0.96699105, 0.96966994,
       0.96610307, 0.96525409, 0.9663233 , 0.97211041, 0.97111786,
       0.96651071, 0.96235811, 0.96229949, 0.96423298, 0.96177382,
       0.96922731, 0.96458164, 0.9634215 , 0.96621371, 0.96065

In [32]:
# Full trade accuracy on all timeframes, counting a profit of exactly 0 as a hit (>= 0):
# shockingly, that makes it about 100% accurate.
# NOTE(review): positive positions only (no abs) - confirm that is intended.
full_long_no_loss = np.where(soft_trade > .9, soft_profit >= 0, np.nan)
np.nanmean(full_long_no_loss, axis = (0, 1))

array([0.78205128, 0.91803279, 0.89108911, 0.87730061, 0.88061336,
       0.8602462 , 0.88882247, 0.92169261, 0.92122538, 0.93407781,
       0.94007156, 0.94463277, 0.94858669, 0.95459531, 0.96381848,
       0.96385084, 0.96159194, 0.9656883 , 0.96485519, 0.97323848,
       0.97392564, 0.97453884, 0.97679416, 0.97565104, 0.98027842,
       0.98514116, 0.98386369, 0.97823258, 0.97716996, 0.98336774,
       0.98439585, 0.98352537, 0.98531978, 0.98440348, 0.98099111,
       0.98529684, 0.98655833, 0.98775547, 0.98895828, 0.98902169,
       0.99059829, 0.98930175, 0.9887501 , 0.98998179, 0.99183007,
       0.99129602, 0.99028175, 0.99067987, 0.990683  , 0.99152015,
       0.99054142, 0.99096238, 0.99014075, 0.99357216, 0.99068859,
       0.99208063, 0.9933023 , 0.99156364, 0.99268198, 0.99435028,
       0.99504146, 0.99640869, 0.99627575, 0.99647786, 0.99576301,
       0.99720923, 0.99502014, 0.99568439, 0.99803117, 0.99547008,
       0.99730458, 0.99699431, 0.99598038, 0.99680701, 0.99457

In [27]:
# Full trade profit on all timeframes: mean soft_profit of the positions above .9
# for each index of the last axis.
# NOTE(review): positive positions only (no abs) - confirm that is intended.
full_long_profit = np.where(soft_trade > .9, soft_profit, np.nan)
np.nanmean(full_long_profit, axis = (0, 1))

array([7.272327 , 7.1456485, 5.4958863, 3.1692212, 2.350186 , 1.8899122,
       1.8555994, 1.8093182, 1.7990168, 1.6801865, 1.5942628, 1.5921887,
       1.4734232, 1.4953012, 1.4823666, 1.4641556, 1.414876 , 1.2949023,
       1.4083811, 1.4606198, 1.4457611, 1.3268101, 1.3967084, 1.35698  ,
       1.2948686, 1.3998835, 1.294979 , 1.3205708, 1.280796 , 1.3243719,
       1.2567165, 1.2532634, 1.2755669, 1.2672589, 1.2286527, 1.2305788,
       1.2767869, 1.2183542, 1.2173533, 1.2164502, 1.2627718, 1.2669466,
       1.199666 , 1.200153 , 1.2807001, 1.2337393, 1.2036744, 1.2152404,
       1.1645474, 1.1802653, 1.2096075, 1.2066774, 1.1184528, 1.2373323,
       1.1721324, 1.2106881, 1.2246623, 1.1632986, 1.2058973, 1.1839378,
       1.2311872, 1.2883145, 1.2189245, 1.1759139, 1.1639053, 1.251327 ,
       1.1859996, 1.2410449, 1.2556417, 1.2325155, 1.20284  , 1.2485127,
       1.2461162, 1.2091612, 1.2154613, 1.234623 , 1.2176992, 1.2074447,
       1.2080635, 1.2544149, 1.2632424, 1.1867762, 

In [33]:
trainer.save_model('srupp.model')

Saving model checkpoint to srupp.model
Configuration saved in srupp.model\config.json
Model weights saved in srupp.model\pytorch_model.bin


In [None]:
# del trainer
# del model
# torch.cuda.empty_cache()

# Appendix

## gpt2 experiments

In [None]:
class GPT2Trader(PreTrainedModel):
    """GPT-2 backbone that turns a sequence of 5-feature bars into 120 soft trade sizes per step.

    Pipeline: linear embedding of the 5 input features -> LayerNorm -> GPT2Model
    (fed through `inputs_embeds`) -> linear head with 120 outputs squashed by
    tanh, so every trade size lies in [-1, 1].
    """

    def __init__(self, config):
        """Build the layers from `config`; overwrites `config.n_layer` and `config.initializer_range`."""
        super().__init__(config)

        # use levine 2020 layer numbers
        n_layer = round((math.log(config.n_embd) - 5.039) / 5.55e-2)
        n_layer = max(1, n_layer)
        print(f'Using {n_layer} layers')
        config.n_layer = n_layer

        # set before GPT2Model is built from this same config object
        config.initializer_range = 1 / math.sqrt(config.n_embd)

        self.embed = nn.Linear(5, config.n_embd, bias = False)
        self.norm = nn.LayerNorm(config.n_embd)
        self.gpt = GPT2Model(config)
        self.trade = nn.Linear(config.n_embd, 120, bias = False)


    def forward(self, ohlcv, future = None):
        """Compute soft trades and, when targets are given, the training loss.

        Args:
            ohlcv: float tensor whose last dimension is 5 (the input features).
            future: optional tensor that multiplies element-wise with the trades
                (last dimension 120), i.e. the outcome each trade is scored
                against (presumably normalised future returns - confirm against
                the preprocessing). Leave as None for inference.

        Returns:
            dict with 'trades' only when `future` is None; otherwise
            'loss', 'profits' (trades * future) and 'trades'.
        """
        embed = self.norm(self.embed(ohlcv))
        hidden = self.gpt(inputs_embeds = embed).last_hidden_state

        # tanh bounds every position to [-1, 1] (full short .. full long)
        soft_trade = torch.tanh(self.trade(hidden))

        # inference path: no targets, so there is nothing to score - return early
        if future is None:
            return {
                'trades': soft_trade,
            }

        soft_profit = soft_trade * future

        # squared shortfall against the best achievable profit |future|: always >= 0 and
        # zero only for a full-size trade in the right direction; because of the square,
        # a losing trade is penalised more than an equally sized winning trade is rewarded
        loss_ppl = torch.square(((-soft_profit + future.abs()))).mean()

        # penalty for big trades (to stop trading from happening with no profit)
        trade_penalty = soft_trade.abs().mean()

        loss = loss_ppl + .1 * trade_penalty # .1 means that a 100% position must make at least .1 of a std to offset loss

        return {
            'loss': loss,
            'profits': soft_profit,
            'trades': soft_trade,
        }

In [None]:
# GPT-2 experiment configuration: inputs arrive as embeddings, so the vocabulary is empty.
config = GPT2Config(
    n_embd = 384,
    n_head = 6,
    n_positions = 2000,
    vocab_size = 0,
    # low dropout since only using 1 epoch training and to make model more robust
    # to data issues (.1 has worse loss, accuracy & t-score)
    resid_pdrop = .01,
    embd_pdrop = .01,
    attn_pdrop = .01,
    summary_first_dropout = 0,
    summary_proj_to_labels = False,
    scale_attn_by_inverse_layer_idx = True,
    use_cache = False,
)
model = GPT2Trader(config)

In [52]:
# 5e-4 lr (best)
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Medium trade %,Medium trade accuracy
100,0.7391,0.576555,1.18,55.53,0.0,,68.1,52.68
200,1.0387,0.508294,1.57,25.15,0.04,53.85,19.25,56.31
300,0.5464,0.482877,1.6,33.05,0.02,43.69,32.39,53.77
400,0.6911,0.527884,1.36,42.9,1.1,61.84,34.89,54.52
500,0.6392,0.453005,1.28,27.61,0.0,,19.7,57.28
600,0.4925,0.488201,1.82,38.68,0.0,,34.53,54.87
700,0.6549,0.472404,1.43,36.18,0.0,50.99,31.99,54.76
800,0.5556,0.428319,1.41,21.04,0.0,42.86,17.26,58.88
900,0.5543,0.445764,1.26,25.83,0.16,52.64,15.78,59.76
1000,0.6621,0.395642,1.37,19.72,4.11,77.08,8.08,68.22


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
  'full trade accuracy': (soft_profit[abs_trade > .9] > 0).mean() * 100,
  ret = ret.dtype.type(ret / rcount)
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
  'full trade accuracy': (soft_profit[abs_trade > .9] > 0).mean() * 100,
  ret = ret.dtype.type(ret / rcount)
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch

TrainOutput(global_step=3270, training_loss=0.5385216050920866, metrics={'train_runtime': 1883.0698, 'train_samples_per_second': 1.737, 'train_steps_per_second': 1.737, 'total_flos': 0.0, 'train_loss': 0.5385216050920866, 'epoch': 1.0})

In [48]:
# 1e-3 lr
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Medium trade %,Medium trade accuracy
100,0.7428,0.59886,1.4,53.69,0.34,52.07,58.1,52.67
200,1.0593,0.629185,1.19,35.43,0.05,50.63,19.08,53.12
300,0.6187,0.574223,1.69,58.2,0.1,55.6,75.46,51.46
400,0.7249,0.498308,1.4,30.9,0.0,50.0,29.13,53.94
500,0.643,0.475523,1.06,39.59,0.14,46.61,24.92,55.38
600,0.5045,0.475192,1.7,38.56,0.06,56.23,36.72,54.34
700,0.7065,0.509966,1.87,37.81,0.0,,42.78,52.66
800,0.5846,0.46077,1.35,21.82,0.88,50.54,17.3,56.59
900,0.5729,0.467708,1.16,24.87,0.0,100.0,18.21,57.44
1000,0.735,0.469905,1.38,25.93,4.13,63.69,17.26,58.26


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
  'full trade accuracy': (soft_profit[abs_trade > .9] > 0).mean() * 100,
  ret = ret.dtype.type(ret / rcount)
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./res

TrainOutput(global_step=3270, training_loss=0.5552892037487905, metrics={'train_runtime': 1886.12, 'train_samples_per_second': 1.734, 'train_steps_per_second': 1.734, 'total_flos': 0.0, 'train_loss': 0.5552892037487905, 'epoch': 1.0})

In [56]:
# 2e-4 lr
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Medium trade %,Medium trade accuracy
100,0.7406,0.527701,2.02,38.95,0.09,52.46,32.52,53.95
200,0.977,0.492953,1.67,26.88,0.09,59.97,21.8,56.01
300,0.5394,0.543064,1.8,29.15,0.0,100.0,27.0,52.78
400,0.6701,0.463039,1.55,29.99,0.0,20.0,27.0,55.51
500,0.6254,0.451693,1.31,31.58,0.0,0.0,25.62,55.81
600,0.4804,0.496943,1.92,42.34,0.0,,41.84,53.82
700,0.6605,0.484283,1.5,28.9,0.01,51.19,26.33,56.26
800,0.5631,0.44198,1.36,22.58,0.0,,20.0,56.21
900,0.5655,0.439754,1.29,24.82,0.0,50.0,18.22,57.8
1000,0.7105,0.449362,1.41,22.11,0.04,65.72,17.88,58.45


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
  'full trade accuracy': (soft_profit[abs_trade > .9] > 0).mean() * 100,
  ret = ret.dtype.type(ret / rcount)
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./res

TrainOutput(global_step=3270, training_loss=0.5430762673007603, metrics={'train_runtime': 1879.3917, 'train_samples_per_second': 1.74, 'train_steps_per_second': 1.74, 'total_flos': 0.0, 'train_loss': 0.5430762673007603, 'epoch': 1.0})

In [12]:
# 5e-5 (really small) learning rate
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7951,0.519946,1.84,39.76,2.33,58.8,3.04,32.59,53.53,1.7
200,0.9255,0.442862,1.63,32.52,1.65,91.58,11.87,21.13,54.21,1.9
300,0.4277,0.386134,1.55,28.43,3.78,83.79,6.39,12.35,57.08,1.61
400,0.5192,0.33999,1.73,33.33,3.96,77.52,5.39,19.27,64.25,2.0
500,0.4715,0.342008,1.55,35.43,3.3,90.87,8.5,24.02,61.75,1.97
600,0.3335,0.302813,1.78,33.35,2.7,94.64,11.23,21.55,68.82,2.33
700,0.4295,0.301962,1.89,43.58,5.76,83.92,5.94,34.55,66.01,1.69
800,0.3861,0.267107,1.93,32.97,2.4,98.0,13.71,21.87,75.97,3.15
900,0.3504,0.242964,1.78,39.49,4.62,93.88,6.49,30.09,72.65,2.22
1000,0.3862,0.247526,1.71,39.8,4.67,94.88,8.3,30.43,70.33,2.26


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.28452607805211244, metrics={'train_runtime': 1779.4527, 'train_samples_per_second': 1.838, 'train_steps_per_second': 1.838, 'total_flos': 0.0, 'train_loss': 0.28452607805211244, 'epoch': 1.0})

## SRU experiments

In [14]:
# sru lr of 1e-4 (6 heads) hidden size 384
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7583,0.511011,1.72,37.6,2.08,61.44,3.55,29.25,54.22,1.73
200,0.936,0.450914,1.78,34.24,5.61,74.47,4.59,19.33,51.92,1.31
300,0.3917,0.400445,1.44,31.94,2.58,87.24,6.43,19.15,62.35,2.06
400,0.4812,0.309347,1.76,36.83,4.74,86.57,7.62,24.7,66.72,1.82
500,0.4362,0.313799,1.72,37.91,5.8,80.42,4.82,25.1,66.08,1.9
600,0.3087,0.283673,1.77,41.22,6.75,82.12,4.94,29.24,68.34,1.84
700,0.3856,0.271688,2.04,45.27,10.61,83.02,4.89,31.36,67.65,1.78
800,0.3302,0.243528,2.02,38.33,5.81,94.43,8.94,26.28,74.4,2.47
900,0.3342,0.227438,1.99,41.24,5.84,95.1,6.96,31.4,75.0,2.44
1000,0.3207,0.209981,2.06,44.56,11.33,91.65,5.9,29.89,73.83,2.03


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.249264092401627, metrics={'train_runtime': 1777.0102, 'train_samples_per_second': 1.84, 'train_steps_per_second': 1.84, 'total_flos': 0.0, 'train_loss': 0.249264092401627, 'epoch': 1.0})

In [12]:
# sru lr of 1e-4 (no heads)
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7496,0.504964,1.74,37.87,2.01,62.5,3.88,29.78,54.1,1.74
200,0.9467,0.419118,1.74,29.66,2.32,96.17,12.28,15.68,55.41,1.86
300,0.3706,0.404617,1.5,35.97,3.06,90.69,6.27,25.53,58.72,1.57
400,0.5042,0.334729,1.87,32.85,4.92,80.6,5.81,17.72,65.83,1.76
500,0.4351,0.300377,1.57,32.84,3.11,90.46,9.01,19.43,69.63,2.38
600,0.3256,0.272262,1.77,38.33,3.9,94.61,10.48,28.19,69.28,2.23
700,0.407,0.28363,2.09,49.07,10.02,83.74,5.37,38.35,65.55,1.67
800,0.3391,0.254286,2.1,39.59,6.25,84.63,5.12,27.66,73.59,2.37
900,0.3531,0.24066,1.91,39.95,6.46,93.23,7.41,29.04,74.37,2.34
1000,0.3936,0.211937,2.19,41.89,7.95,90.82,6.8,29.96,76.9,2.17


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.26571082660546724, metrics={'train_runtime': 1763.507, 'train_samples_per_second': 1.854, 'train_steps_per_second': 1.854, 'total_flos': 0.0, 'train_loss': 0.26571082660546724, 'epoch': 1.0})

In [9]:
# sru lr of 5e-4
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Medium trade %,Medium trade accuracy
100,0.707,0.511279,1.23,37.33,4.36,66.84,30.01,56.53
200,1.1172,0.573373,1.47,58.55,24.83,54.93,59.16,52.02
300,0.4527,0.496984,2.11,49.29,18.54,60.35,45.26,56.11
400,0.5377,0.499412,1.34,56.95,26.6,61.27,55.62,58.81
500,0.5142,0.305154,1.43,31.76,5.19,80.69,20.34,68.63
600,0.3887,0.3318,1.54,27.54,6.15,78.1,13.94,70.29
700,0.4762,0.40442,1.88,59.19,18.03,67.74,62.29,61.7
800,0.422,0.292924,1.46,32.75,4.56,84.47,21.5,72.27
900,0.4076,0.298788,1.46,35.89,4.53,83.85,26.33,69.7
1000,0.4934,0.427605,1.49,30.62,8.22,61.58,22.05,63.34


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.4054160027693536, metrics={'train_runtime': 1613.905, 'train_samples_per_second': 2.026, 'train_steps_per_second': 2.026, 'total_flos': 0.0, 'train_loss': 0.4054160027693536, 'epoch': 1.0})

In [8]:
# Experiment: SRU model trained with a learning rate of 3e-4 (the value recommended by the SRU paper).
# The learning rate is not set in this cell; the cell only launches the training run.
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7156,0.474766,1.47,34.31,2.3,74.83,5.03,23.51,54.96,1.76
200,1.0027,0.397932,1.35,36.01,4.6,70.16,4.66,22.5,59.05,1.92
300,0.4077,0.576816,1.4,64.61,21.53,63.24,2.31,49.73,51.53,1.02
400,0.6076,0.396596,1.37,41.82,6.82,80.57,6.11,30.14,58.76,1.43
500,0.4823,0.416988,1.38,45.03,10.01,74.58,3.96,32.45,58.6,1.4
600,0.3492,0.275632,1.74,36.19,6.52,82.49,7.51,21.84,69.87,2.14
700,0.4503,0.41137,2.02,44.86,17.14,67.97,2.47,22.69,59.69,1.37
800,0.4962,0.31168,1.64,31.82,4.78,82.2,7.52,15.6,71.5,2.61
900,0.438,0.337457,1.98,30.07,6.52,78.31,4.99,13.66,65.76,1.46
1000,0.4921,0.32147,1.77,34.01,5.06,76.34,5.85,20.93,67.87,1.69


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.37757016569831686, metrics={'train_runtime': 1725.6916, 'train_samples_per_second': 1.895, 'train_steps_per_second': 1.895, 'total_flos': 0.0, 'train_loss': 0.37757016569831686, 'epoch': 1.0})

In [8]:
# Experiment: SRU model trained with a learning rate of 1e-4 and no layer norm on the embeddings.
# Neither setting is applied in this cell; the cell only launches the training run.
trainer.train()

***** Running training *****
  Num examples = 3270
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3270
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Day sharpe,Trade %,Full trade %,Full trade accuracy,Full trade g/l,Medium trade %,Medium trade accuracy,Medium trade g/l
100,0.7822,0.507575,1.69,38.27,2.82,61.22,3.51,29.37,54.15,1.67
200,0.9126,0.412638,1.7,29.22,3.39,87.24,8.66,13.84,55.5,1.68
300,0.3751,0.37114,1.52,29.61,3.99,87.84,6.89,14.14,61.09,1.59
400,0.5199,0.322033,1.59,30.84,1.79,95.99,12.13,17.96,70.68,2.92
500,0.4432,0.309579,1.59,37.17,3.13,92.69,8.45,27.85,65.61,2.37
600,0.3128,0.306958,1.75,46.3,6.57,83.6,6.39,38.12,63.18,1.7
700,0.4281,0.289558,1.79,39.36,4.67,87.35,6.59,29.44,70.26,1.79
800,0.3685,0.289121,1.82,38.88,6.51,84.92,4.81,27.71,71.16,2.11
900,0.3393,0.257594,1.8,42.79,8.24,88.87,4.83,31.8,69.48,2.14
1000,0.3458,0.247257,1.94,45.57,12.59,85.36,4.14,29.79,70.93,1.96


***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
***** Running Evaluation *****
  Num examples = 86
  Batch size = 1
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./res

TrainOutput(global_step=3270, training_loss=0.2766398222803705, metrics={'train_runtime': 1805.5267, 'train_samples_per_second': 1.811, 'train_steps_per_second': 1.811, 'total_flos': 0.0, 'train_loss': 0.2766398222803705, 'epoch': 1.0})

## failed idea: have the model go through a timewise curriculum of the data
The idea was that if training didn't respect the time-series nature of the data, the model could "memorize" later parts of the data and use them to predict earlier data better (an advantage it wouldn't have at test time). Seemingly this isn't an issue, as the model performs better on a validation set that does come from the future.

In [3]:
class TraderTrainer(Trainer):
    """Trainer variant kept from the timewise-curriculum experiment.

    Only the construction of the training DataLoader is overridden; every
    other behaviour comes from the ``Trainer`` base class.
    """

    def get_train_dataloader(self) -> DataLoader:
        """Build the DataLoader that feeds ``self.train_dataset`` to training.

        Returns:
            A ``DataLoader`` over ``self.train_dataset`` whose batch size,
            collate function, ``drop_last`` flag and worker count are taken
            from this trainer, and whose sampler is whatever
            ``self._get_train_sampler()`` returns.

        Raises:
            ValueError: if no training dataset has been set.
        """
        dataset = self.train_dataset
        if dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # The experiment's explicit `shuffle=False` switch (tried "to stop
        # overfitting") is disabled, so sample order is decided entirely by
        # the sampler obtained here.
        loader_options = dict(
            batch_size=self.args.train_batch_size,
            sampler=self._get_train_sampler(),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
        )
        return DataLoader(dataset, **loader_options)

## make sure unshuffled split maintains order

In [16]:
# Toy dataset holding 0..99 so that any reordering by the splits is easy to spot.
foo = Dataset.from_dict({"input": [*range(100)]})

# First carve off the last 10% without shuffling, then halve that tail
# (again without shuffling) into validation and test.
split = foo.train_test_split(test_size=0.1, shuffle=False)
valid_test = split["test"].train_test_split(test_size=0.5, shuffle=False)

# Rebind `foo` to the three-way split that the next cell inspects.
foo = DatasetDict(
    train=split["train"],
    validation=valid_test["train"],
    test=valid_test["test"],
)

In [20]:
# Show the validation and test inputs side by side; if the unshuffled splits keep
# their order, both should be ascending with validation coming before test.
foo['validation']['input'], foo['test']['input']

([90, 91, 92, 93, 94], [95, 96, 97, 98, 99])

## quick timing check

In [None]:
# Construct a GPT2Trader from `config` and move it to the GPU for the timing cell that follows.
model = GPT2Trader(config).cuda()

In [72]:
%%timeit
# Timed round trip: create a random (4, 391, 256) batch on the host, move it to
# the GPU, run one forward pass, then copy the batch back to the host.
# NOTE(review): the final device-to-host copy presumably makes the timer wait
# for the queued GPU work to finish — confirm before trusting the number.
gpu_batch = torch.randn(4, 391, 256).cuda()
model(gpu_batch)
cpu = gpu_batch.cpu()

28.2 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# Construct a new GPT2Trader from `config` on the CPU so the same forward pass can be timed without the GPU.
model = GPT2Trader(config).cpu()

Using 9 layers


In [74]:
%%timeit
# Timed CPU pass: create a random (4, 391, 256) batch and run one forward pass on it.
model(torch.randn(4, 391, 256))

748 ms ± 82.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
