In [1]:
# this notebook will use a basic GPT based decision transformer in offline reinforcement learning setting to create bot for trading stock
# get cuda device
# import libraries
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from cust_transf import DecisionTransformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets.load import load_dataset

# load huggingface dataset from json file in replaybuffer folder
foldername = 'replaybuffer'

# get filenames in folder
import os
filenames = os.listdir(foldername)

# get full path of files
full_filenames = [os.path.join(foldername, filename) for filename in filenames]

data = load_dataset("json", data_files = full_filenames[3], field = 'data')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-27a7716f5eae9a12/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 78.72it/s]


In [7]:
len(data['train']['state'][2])

167

In [13]:
from datasets.load import load_dataset
from torch.utils.data import Dataset, DataLoader
import numpy as np

# utility function to compute the discounted cumulative sum of a vector
def discount_cumsum(x, gamma):
    disc_cumsum = np.zeros_like(x)
    disc_cumsum[-1] = x[-1]
    for t in reversed(range(x.shape[0]-1)):
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t+1]
    return disc_cumsum

# define a custom dataset class which loads the data, modifies the reward to be the discounted cumulative sum and apply trajectory masking
class CustomTrajTrainDataset(Dataset):
    def __init__(self, file_name, context_len, gamma, rtg_scale):
        self.gamma = gamma
        self.context_len = context_len

        # load the data
        # TODO: change the data loading to accomodate inhomoegeneous data (different length of trajectories)
        self.data = load_dataset("json", data_files = file_name, field = 'data')['train']
        """
        self.data_state = np.array(data['train']['state'], dtype=np.float32)
        self.data_action = np.array(data['train']['action'], dtype=np.float32)
        self.rtg = np.array(data['train']['reward'], dtype=np.float32)
        """
        self.rtg = []
        min_len = 10**6
        states = []
        for traj in self.data['state']:
            states.append(np.array(traj, dtype=np.float32))
            if len(traj) < min_len:
                min_len = len(traj)

        for reward in self.data['reward']:
            self.rtg.append(discount_cumsum(np.array(reward, dtype=np.float32), 1.0)/rtg_scale)

        states = np.concatenate(states, axis=0)
        self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

        # calculate min len, the mean and std of the state and rtg for all data
        self.stateshape = len(states)
        # calculate mean of state and rtg with numpy

        #self.state_mean = torch.mean(data['train']['state'], dim=(-2,-1), keepdim=True)
        #self.state_std = torch.std(data['train']['state'], dim=(-2,-1), keepdim=True)
        #self.norm_state = (self.data_state - self.state_mean) / self.state_std

        #self.rtg = np.apply_along_axis(discount_cumsum, 1, self.data['reward'], self.gamma) # type: ignore
        #self.rtg = self.rtg / rtg_scale

    def get_state_stats(self):
        return self.state_mean, self.state_std        

    def __len__(self):
        return self.stateshape

    def __getitem__(self, idx):

        state = np.array(self.data['state'][idx])
        action = np.array(self.data['action'][idx])
        rtg = np.array(self.rtg[idx])

        data_len = len(state)
        
        if data_len > self.context_len:
            # sample random start index
            start_idx = np.random.randint(0, data_len - self.context_len)
            # slice the data and convert to torch
            state = torch.from_numpy(state[start_idx:start_idx+self.context_len])
            action = torch.from_numpy(action[start_idx:start_idx+self.context_len])
            rtg = torch.from_numpy(rtg[start_idx:start_idx+self.context_len])
            timesteps = torch.arange(start=start_idx, end=start_idx + self.context_len, step=1)
            # trajectory mask
            mask = torch.ones(self.context_len, dtype=torch.long)
        else:
            padding_len = self.context_len - data_len

            # pad the data with zeros
            state = torch.from_numpy(state)
            state = torch.cat([state, torch.zeros((padding_len, *state.shape[1:]))], dim=0)

            action = torch.from_numpy(action)
            action = torch.cat([action, torch.zeros((padding_len, *action.shape[1:]))], dim=0)

            rtg = torch.from_numpy(rtg)
            rtg = torch.cat([rtg, torch.zeros((padding_len, *rtg.shape[1:]))], dim=0)

            timesteps = torch.arange(start=0, end=self.context_len, step=1)

            # trajectory mask
            mask = torch.cat([torch.ones(data_len, dtype=torch.long), torch.zeros(padding_len, dtype=torch.long)], dim=0)
        
        return state, action, rtg, timesteps, mask


In [14]:
# load huggingface dataset from json file in replaybuffer folder
foldername = 'replaybuffer'

# get filenames in folder
import os
filenames = os.listdir(foldername)

# get full path of files
full_filenames = [os.path.join(foldername, filename) for filename in filenames]

# create datasets and store in list from the list of filenames 
context_len = 30
Max_balance = 2147483647
gamma = 0.99

datasets = []
for name in full_filenames:
    dataset = CustomTrajTrainDataset(name, context_len, gamma, Max_balance)
    datasets.append(dataset)

# concatenate all datasets
combined_dataset = torch.utils.data.ConcatDataset(datasets)

Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-b92f90190ec4f8ad/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 1159.93it/s]
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-c1a820023dacf4a9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 1183.49it/s]
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-335e91cc2e824a43/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 1032.83it/s]
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-27a7716f5eae9a12/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 868.39it/s]
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-4c76c05d1d4da50b/0.0.0/e347ab1c932092252e71

In [22]:
dataset.rtg[0]

array([[1.12477164e-06],
       [1.11948305e-06],
       [1.10898952e-06],
       [1.09364692e-06],
       [1.07318829e-06],
       [1.04726178e-06],
       [1.01497267e-06],
       [9.77262176e-07],
       [9.35211063e-07],
       [8.87489762e-07],
       [8.34665173e-07],
       [7.78111826e-07],
       [7.16890043e-07],
       [6.48865467e-07],
       [5.75051331e-07],
       [4.93915820e-07],
       [4.06925295e-07],
       [3.14399813e-07],
       [2.19857370e-07],
       [1.14428488e-07]])

In [23]:
# loop through the dataset and find the highest and lowest reward
max_reward = -math.inf
min_reward = math.inf
for dataset in datasets:
    for rtg in dataset.rtg:
        max_reward = max(max_reward, rtg.max())
        min_reward = min(min_reward, rtg.min())


print("max reward: ", max_reward)
print("min reward: ", min_reward)

max reward:  0.021209774548751198
min reward:  -2.6889867411926037e-07


In [5]:
# define training parameters
batch_size = 32
# small learning rate to try to avoid mixed precision caused NaNs
lr = 3e-5
wt_decay = 1e-4
warmup_steps = 10000
n_epochs = 500

In [6]:
# create dataloader from the concatenated dataset
dataloader = DataLoader(combined_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

In [7]:
# define model parameters
# sample 1 batch from dataloader
norm_state, actions, rtg, timestep, traj_mask = next(iter(dataloader))
# use batch shape to determine state dimension
state_dim = norm_state.shape[-1]
act_dim = actions.shape[-1] # discrete action space
# use batch shape to determine context length


n_blocks = 4 # number of transformer blocks
h_dim = 96 # hidden dimension
n_heads = 8 # number of heads in multi-head attention
drop_p = 0.1 # dropout probability


In [8]:
# create the model
model = DecisionTransformer(state_dim, act_dim, n_blocks, h_dim, context_len, n_heads, drop_p).to(device)

# create optimizer
# use larger eps to try to avoid mixed precision overflow caused NaNs
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wt_decay, eps=1e-6)

# create scheduler
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(1.0, (step + 1) / warmup_steps))

# create a GradScaler for mixed precision training
scaler = torch.cuda.amp.GradScaler(growth_interval=150)
min_scale = 128

In [13]:
# get the model parameters size
n_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
print(f"Number of parameters: {n_params}")

Number of parameters: 844144


In [8]:
# test run the model
with torch.no_grad():
    norm_state, actions, rtg, timestep , traj_mask= next(iter(dataloader))
    norm_state = norm_state.to(device)
    actions = actions.to(device)
    # convert rtg to float
    rtg = rtg.to(device).float()
    timestep = timestep.to(device)
    traj_mask = traj_mask.to(device)
    action_targets = torch.clone(actions).detach().to(device)
    return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

    # check shape of norm_state
    print(f"shape norm_state: {norm_state.shape}")
    # check shape of rtg
    print(f"shape rtg: {rtg.shape}")
    # check shape of timestep
    print(f"shape timestep: {timestep.shape}")
    # check shape of actions
    print(f"shape actions: {actions.shape}")
    print(f"shape act_preds: {act_preds.shape}")
    print(f"shape action_targets: {action_targets.shape}")
    
    # consider only the action that are padded
    act_preds = act_preds.view(-1, act_dim)[traj_mask.view(-1) > 0]
    action_targets = action_targets.view(-1, act_dim)[traj_mask.view(-1) > 0]

    # check shape of action targets
    print(action_targets.shape)
    # check shape of action predictions
    print(act_preds.shape)

# check for nan values and inf values in the input and the output of the model
print(torch.isnan(norm_state).any())
print(torch.isnan(rtg).any())
print(torch.isnan(timestep).any())
print(torch.isnan(actions).any())
print(torch.isnan(act_preds).any())
print(torch.isnan(action_targets).any())



shape norm_state: torch.Size([32, 20, 13])
shape rtg: torch.Size([32, 20, 1])
shape timestep: torch.Size([32, 20])
shape actions: torch.Size([32, 20, 2])
shape act_preds: torch.Size([32, 20, 2])
shape action_targets: torch.Size([32, 20, 2])
torch.Size([640, 2])
torch.Size([640, 2])
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')


In [9]:
from tqdm import tqdm

# get the start time to calculate training time
import datetime

# custom training function which take in the model, dataset, optimizer, scheduler, scaler, n_epochs, min_scale
def train_model(model, dataloader, optimizer, scheduler, scaler, n_epochs, min_scale):

    # record the start time
    start_time = datetime.datetime.now()

    # define training parameters
    log_action_losses = []

    # train model
    for epoch in range(n_epochs):
        model.train()
        

        for norm_state, actions, rtg, timestep, traj_mask in tqdm(dataloader):
            # get batch data to device
            norm_state = norm_state.to(device)
            actions = actions.to(device)
            rtg = rtg.to(device).float()
            timestep = timestep.to(device)
            traj_mask = traj_mask.to(device)

            action_targets = torch.clone(actions).detach().to(device)

            # Zeroes out the gradients
            optimizer.zero_grad()

            # run forward pass with autocasting
            # disable autocasting for now to avoid mixed precision caused NaNs
            with torch.cuda.amp.autocast(enabled=False):
                _, _, act_preds = model.forward(norm_state, rtg, timestep, actions)

                # consider only the action that are padded
                act_preds = act_preds.view(-1, act_dim)[traj_mask.view(-1) > 0]
                action_targets = action_targets.view(-1, act_dim)[traj_mask.view(-1) > 0]

                # calculate losses just for actions
                loss = F.mse_loss(act_preds, action_targets, reduction='mean')

            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            # unscale the gradients
            scaler.unscale_(optimizer)
            # Clips the gradients by norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

            # scaler.step() first unscales the gradients of the optimizer's assigned params.
            # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)

            # Updates the learning rate according to the scheduler
            scheduler.step()
            # Updates the scale for next iteration.
            scaler.update()
            # enforce min scale to avoid mixed precision caused NaNs
            if scaler.get_scale() < min_scale:
                scaler._scale = torch.tensor(min_scale).to(scaler._scale)
        
            # append action loss to log
            log_action_losses.append(loss.detach().cpu().item())

        # print every 10 loss log
        if epoch % 100 == 0 or epoch == n_epochs - 1:
            print(f'Epoch {epoch}: Loss: {log_action_losses[-1]}')

    # record the end time
    end_time = datetime.datetime.now()
    print(f'Training time: {end_time - start_time}')
    
    return model, log_action_losses

In [10]:
# train model on each dataloader and store log_action_losses in a list
_, log_action_losses = train_model(model, dataloader, optimizer, scheduler, scaler, n_epochs, min_scale)


100%|██████████| 91/91 [00:01<00:00, 49.82it/s]


Epoch 0: Loss: 0.587468147277832


100%|██████████| 91/91 [00:01<00:00, 67.29it/s]
100%|██████████| 91/91 [00:01<00:00, 67.02it/s]
100%|██████████| 91/91 [00:01<00:00, 67.35it/s]
100%|██████████| 91/91 [00:01<00:00, 65.63it/s]
100%|██████████| 91/91 [00:01<00:00, 62.14it/s]
100%|██████████| 91/91 [00:01<00:00, 65.66it/s]
100%|██████████| 91/91 [00:01<00:00, 65.10it/s]
100%|██████████| 91/91 [00:01<00:00, 64.49it/s]
100%|██████████| 91/91 [00:01<00:00, 66.49it/s]
100%|██████████| 91/91 [00:01<00:00, 68.66it/s]
100%|██████████| 91/91 [00:01<00:00, 67.34it/s]
100%|██████████| 91/91 [00:01<00:00, 65.82it/s]
100%|██████████| 91/91 [00:01<00:00, 66.44it/s]
100%|██████████| 91/91 [00:01<00:00, 66.32it/s]
100%|██████████| 91/91 [00:01<00:00, 66.84it/s]
100%|██████████| 91/91 [00:01<00:00, 66.19it/s]
100%|██████████| 91/91 [00:01<00:00, 67.03it/s]
100%|██████████| 91/91 [00:01<00:00, 66.67it/s]
100%|██████████| 91/91 [00:01<00:00, 67.57it/s]
100%|██████████| 91/91 [00:01<00:00, 66.93it/s]
100%|██████████| 91/91 [00:01<00:00, 66.

Epoch 100: Loss: 0.13367286324501038


100%|██████████| 91/91 [00:01<00:00, 67.53it/s]
100%|██████████| 91/91 [00:01<00:00, 64.62it/s]
100%|██████████| 91/91 [00:01<00:00, 67.88it/s]
100%|██████████| 91/91 [00:01<00:00, 68.20it/s]
100%|██████████| 91/91 [00:01<00:00, 63.27it/s]
100%|██████████| 91/91 [00:01<00:00, 64.72it/s]
100%|██████████| 91/91 [00:01<00:00, 66.26it/s]
100%|██████████| 91/91 [00:01<00:00, 65.18it/s]
100%|██████████| 91/91 [00:01<00:00, 68.11it/s]
100%|██████████| 91/91 [00:01<00:00, 68.64it/s]
100%|██████████| 91/91 [00:01<00:00, 66.76it/s]
100%|██████████| 91/91 [00:01<00:00, 67.84it/s]
100%|██████████| 91/91 [00:01<00:00, 68.94it/s]
100%|██████████| 91/91 [00:01<00:00, 68.33it/s]
100%|██████████| 91/91 [00:01<00:00, 68.09it/s]
100%|██████████| 91/91 [00:01<00:00, 67.46it/s]
100%|██████████| 91/91 [00:01<00:00, 67.35it/s]
100%|██████████| 91/91 [00:01<00:00, 67.79it/s]
100%|██████████| 91/91 [00:01<00:00, 65.89it/s]
100%|██████████| 91/91 [00:01<00:00, 66.28it/s]
100%|██████████| 91/91 [00:01<00:00, 67.

Epoch 200: Loss: 0.2000335305929184


100%|██████████| 91/91 [00:01<00:00, 81.01it/s]
100%|██████████| 91/91 [00:01<00:00, 81.95it/s]
100%|██████████| 91/91 [00:01<00:00, 80.79it/s]
100%|██████████| 91/91 [00:01<00:00, 81.08it/s]
100%|██████████| 91/91 [00:01<00:00, 81.42it/s]
100%|██████████| 91/91 [00:01<00:00, 82.55it/s]
100%|██████████| 91/91 [00:01<00:00, 81.71it/s]
100%|██████████| 91/91 [00:01<00:00, 82.61it/s]
100%|██████████| 91/91 [00:01<00:00, 81.10it/s]
100%|██████████| 91/91 [00:01<00:00, 85.44it/s]
100%|██████████| 91/91 [00:01<00:00, 78.89it/s]
100%|██████████| 91/91 [00:01<00:00, 84.57it/s]
100%|██████████| 91/91 [00:01<00:00, 79.75it/s]
100%|██████████| 91/91 [00:01<00:00, 80.02it/s]
100%|██████████| 91/91 [00:01<00:00, 85.84it/s]
100%|██████████| 91/91 [00:01<00:00, 82.36it/s]
100%|██████████| 91/91 [00:01<00:00, 80.27it/s]
100%|██████████| 91/91 [00:01<00:00, 84.09it/s]
100%|██████████| 91/91 [00:01<00:00, 83.92it/s]
100%|██████████| 91/91 [00:01<00:00, 84.56it/s]
100%|██████████| 91/91 [00:01<00:00, 82.

Epoch 300: Loss: 0.1423427313566208


100%|██████████| 91/91 [00:01<00:00, 82.54it/s]
100%|██████████| 91/91 [00:01<00:00, 83.30it/s]
100%|██████████| 91/91 [00:01<00:00, 81.23it/s]
100%|██████████| 91/91 [00:01<00:00, 80.34it/s]
100%|██████████| 91/91 [00:01<00:00, 84.66it/s]
100%|██████████| 91/91 [00:01<00:00, 82.23it/s]
100%|██████████| 91/91 [00:01<00:00, 87.00it/s]
100%|██████████| 91/91 [00:01<00:00, 85.66it/s]
100%|██████████| 91/91 [00:01<00:00, 86.61it/s]
100%|██████████| 91/91 [00:01<00:00, 86.64it/s]
100%|██████████| 91/91 [00:01<00:00, 88.01it/s]
100%|██████████| 91/91 [00:01<00:00, 87.53it/s]
100%|██████████| 91/91 [00:01<00:00, 88.26it/s]
100%|██████████| 91/91 [00:01<00:00, 86.05it/s]
100%|██████████| 91/91 [00:01<00:00, 86.20it/s]
100%|██████████| 91/91 [00:01<00:00, 85.86it/s]
100%|██████████| 91/91 [00:01<00:00, 88.62it/s]
100%|██████████| 91/91 [00:01<00:00, 82.94it/s]
100%|██████████| 91/91 [00:01<00:00, 85.97it/s]
100%|██████████| 91/91 [00:01<00:00, 81.59it/s]
100%|██████████| 91/91 [00:01<00:00, 86.

Epoch 400: Loss: 0.2025459110736847


100%|██████████| 91/91 [00:01<00:00, 81.48it/s]
100%|██████████| 91/91 [00:01<00:00, 80.81it/s]
100%|██████████| 91/91 [00:01<00:00, 81.76it/s]
100%|██████████| 91/91 [00:01<00:00, 81.64it/s]
100%|██████████| 91/91 [00:01<00:00, 81.69it/s]
100%|██████████| 91/91 [00:01<00:00, 82.82it/s]
100%|██████████| 91/91 [00:01<00:00, 79.31it/s]
100%|██████████| 91/91 [00:01<00:00, 80.14it/s]
100%|██████████| 91/91 [00:01<00:00, 81.01it/s]
100%|██████████| 91/91 [00:01<00:00, 82.20it/s]
100%|██████████| 91/91 [00:01<00:00, 84.57it/s]
100%|██████████| 91/91 [00:01<00:00, 83.01it/s]
100%|██████████| 91/91 [00:01<00:00, 82.29it/s]
100%|██████████| 91/91 [00:01<00:00, 83.76it/s]
100%|██████████| 91/91 [00:01<00:00, 82.48it/s]
100%|██████████| 91/91 [00:01<00:00, 85.54it/s]
100%|██████████| 91/91 [00:01<00:00, 82.13it/s]
100%|██████████| 91/91 [00:01<00:00, 83.21it/s]
100%|██████████| 91/91 [00:01<00:00, 83.90it/s]
100%|██████████| 91/91 [00:01<00:00, 82.94it/s]
100%|██████████| 91/91 [00:01<00:00, 84.

Epoch 499: Loss: 0.19449269771575928
Training time: 0:09:55.818934





In [None]:
# test run the model
model.eval()
with torch.no_grad():
    

In [11]:
import json

# save model using torch.save() and save it to a directory
directory = 'model'
model_name = 'AAPL_model.pt'
if not os.path.exists(directory):
     os.makedirs(directory)
torch.save(model.state_dict(), os.path.join(directory, model_name))

# write model parameters to a json file

model_params = {
     'state_dim': state_dim,
     'act_dim': act_dim,
     'n_blocks': n_blocks,
     'h_dim': h_dim,
     'context_len': context_len,
     'n_heads': n_heads,
     'drop_p': drop_p,
}
with open(os.path.join(directory, 'AAPL_model_params.json'), 'w') as f:
     json.dump(model_params, f)