In [1]:
# this notebook will use a basic GPT based decision transformer in offline reinforcement learning setting to create bot for trading stock
# get cuda device
# import libraries
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from cust_transf import DecisionTransformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import numpy as np

#utility function to compute the discounted cumulative sum of a vector
def discount_cumsum(x, gamma):
    disc_cumsum = np.zeros_like(x)
    
    disc_cumsum[-1] = x[-1]
    
    for t in reversed(range(x.shape[0]-1)):
        
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t+1]
    return disc_cumsum

#utility function to evaluate the performance of the agent on a given environment
def evaluate(model, device, context_len, env, rtg_target, rtg_scale, 
            num_eval_ep=10, max_test_ep_len=28, state_mean=None, state_std=None, render=False):
    eval_batch_size = 1

    results = {}
    total_reward = 0
    total_length = 0

    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    state_mean = torch.zeros((state_dim,)).to(device) if state_mean is None else torch.from_numpy(state_mean).to(device)
    state_std = torch.ones((state_dim,)).to(device) if state_std is None else torch.from_numpy(state_std).to(device)
    
    # create timestep for transformer
    timesteps = torch.arange(start=0, end=max_test_ep_len, step=1)
    timesteps = timesteps.unsqueeze(0).repeat(eval_batch_size, 1).to(device)

    # evaluate the agent
    model.eval()
    with torch.no_grad():

        for _ in range(num_eval_ep):

            # zeros place holders
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim),
                                dtype=torch.float32, device=device)

            states = torch.zeros((eval_batch_size, max_test_ep_len, state_dim),
                                dtype=torch.float32, device=device)
            
            rewards_to_go = torch.zeros((eval_batch_size, max_test_ep_len, 1),
                                dtype=torch.float32, device=device)
            
            # init episode
            running_state = env.reset()
            running_reward = 0
            running_rtg = rtg_target / rtg_scale

            for t in range(max_test_ep_len):

                total_timesteps += 1

                # add state in placeholder and normalize
                states[0, t] = torch.from_numpy(running_state).to(device)
                states[0, t] = (states[0, t] - state_mean) / state_std

                # calcualate running rtg and add in placeholder
                running_rtg = running_rtg - (running_reward / rtg_scale)
                rewards_to_go[0, t] = running_rtg

                if t < context_len:
                    _, act_preds, _ = model.forward(timesteps[:,:context_len],
                                                states[:,:context_len],
                                                actions[:,:context_len],
                                                rewards_to_go[:,:context_len])
                    act = act_preds[0, t].detach()
                else:
                    _, act_preds, _ = model.forward(timesteps[:,t-context_len+1:t+1],
                                                states[:,t-context_len+1:t+1],
                                                actions[:,t-context_len+1:t+1],
                                                rewards_to_go[:,t-context_len+1:t+1])
                    act = act_preds[0, -1].detach()


                running_state, running_reward, done, _ = env.step(act.cpu().numpy())

                # add action in placeholder
                actions[0, t] = act

                total_reward += running_reward

                if render:
                    env.render()
                if done:
                    break

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_ep_len'] = total_timesteps / num_eval_ep
    
    return results




In [4]:
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
# define a custom dataset class which loads the data and modifies the reward to be the discounted cumulative sum
class CustomDataset(Dataset):
    def __init__(self, file_name, gamma):
        self.gamma = gamma

        # load the data
        data = load_dataset("json", data_files = file_name, field = 'data').with_format('torch')
        self.data = data['train']
        # calculate the mean and std of the state
        stateshape = data['train']['state'].shape
        allstates = data['train']['state'].reshape([stateshape[0]*stateshape[1], stateshape[2]])
        self.state_std, self.state_mean = torch.std_mean(allstates, dim = 0)
        

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        # calculate the discounted cumulative sum
        discount_reward = discount_cumsum(data['reward'], self.gamma)
        #
        # get the state, action, reward and next state
        return (data['state']-self.state_mean)/self.state_std, data['action'], torch.from_numpy(discount_reward), data['timestep']


In [5]:
# load huggingface dataset from json file
filename = 'AAPL_2190_2016-01-01_1d_random_replaybuffer.json'

dataset = CustomDataset(filename, gamma = 0.99)

Using custom data configuration default-d6108ac762ae0e39
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-d6108ac762ae0e39/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 568.56it/s]


In [6]:

# define training parameters
batch_size = 2
lr = 1e-4
wt_decay = 1e-4
warmup_steps = 10000
n_epochs = 2

In [7]:
# create dataloader from dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
# define model parameters
# sample 1 batch from dataloader
norm_state, actions, rtg, timestep = next(iter(dataloader))
# use batch shape to determine state dimension
state_dim = norm_state.shape[-1]
act_dim = actions.shape[-1] # discrete action space
# use batch shape to determine context length
context_len = timestep.shape[-1] # K in decision transformer

n_blocks = 3 # number of transformer blocks
h_dim = 96 # hidden dimension
n_heads = 3 # number of heads in multi-head attention
drop_p = 0.1 # dropout probability

#

In [9]:
# create the model
model = DecisionTransformer(state_dim, act_dim, n_blocks, h_dim, context_len, n_heads, drop_p).to(device)

# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wt_decay)

# create scheduler
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(1.0, (step + 1) / warmup_steps))

# create a GradScaler for mixed precision training
scaler = torch.cuda.amp.GradScaler()

# create a grad

In [12]:
# test run the model
with torch.no_grad():
    norm_state, actions, rtg, timestep = next(iter(dataloader))
    norm_state = norm_state.to(device)
    actions = actions.to(device)
    rtg = rtg.to(device)
    timestep = timestep.to(device)
    action_targets = torch.clone(actions).detach().to(device)
    return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

    # check shape of norm_state
    print(norm_state.shape)
    # check shape of rtg
    print(rtg.shape)
    # check shape of timestep
    print(timestep.shape)
    # check shape of actions
    print(actions.shape)
    print(act_preds.shape)
    print(action_targets.shape)
    
    # calculate losses just for actions
    act_preds = act_preds.view(-1, act_dim)
    action_targets = action_targets.view(-1, act_dim)

    # check shape of action targets
    print(action_targets.shape)
    # check shape of action predictions
    print(act_preds.shape)

# TODO: check for nan values and inf values in the input and the output

torch.Size([2, 1509, 14])
torch.Size([2, 1509, 1])
torch.Size([2, 1509])
torch.Size([2, 1509, 2])
torch.Size([2, 1509, 2])
torch.Size([2, 1509, 2])
torch.Size([3018, 2])
torch.Size([3018, 2])


In [14]:
print(act_preds)
print(action_targets)
print(norm_state)

tensor([[-0.6934, -0.1297],
        [-0.1443,  0.4099],
        [ 0.8372,  0.1880],
        ...,
        [-0.2587, -0.3713],
        [ 0.0776, -0.2321],
        [-0.2730, -0.6220]], device='cuda:0')
tensor([[2.6152, 0.1912],
        [0.3809, 0.6733],
        [1.6328, 0.9775],
        ...,
        [2.6875, 0.0536],
        [0.3257, 0.4534],
        [2.7578, 0.3606]], device='cuda:0')
tensor([[[-0.9650, -0.9705, -0.9736,  ..., -1.6203, -1.8992, -1.4272],
         [-0.9918, -0.9805, -0.9910,  ..., -1.6203, -1.8992, -1.4272],
         [-0.9727, -0.9776, -0.9885,  ..., -0.9699, -1.8992, -1.4272],
         ...,
         [ 2.6713,  2.6457,  2.7065,  ...,  2.5397,  2.0674,  2.7666],
         [ 2.7016,  2.7317,  2.7509,  ...,  2.5397,  2.0674,  2.7666],
         [ 2.7765,  2.7536,  2.7870,  ...,  2.5397,  2.0674,  2.7666]],

        [[-0.9650, -0.9705, -0.9736,  ..., -1.6203, -1.8992, -1.4272],
         [-0.9918, -0.9805, -0.9910,  ..., -0.9705, -1.8992, -1.4272],
         [-0.9727, -0.9776, -0

In [11]:
# create training loop
from tqdm import tqdm

# get the start time to calculate training time
import time
start_time = time.time()

for i in range(n_epochs):
    model.train()
    for norm_state, actions, rtg, timestep in tqdm(dataloader):
        # get batch data to device
        norm_state = norm_state.to(device)
        actions = actions.to(device)
        rtg = rtg.to(device)
        timestep = timestep.to(device)

        action_targets = torch.clone(actions).detach().to(device)

        # Zeroes out the gradients
        optimizer.zero_grad()

        # run forward pass with autocasting
        with torch.cuda.amp.autocast():
            return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

            # calculate losses just for actions
            act_preds = act_preds.view(-1, act_dim)
            action_targets = action_targets.view(-1, act_dim)

            loss = F.mse_loss(act_preds, action_targets, reduction='mean')

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # Clips the gradients by norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the learning rate according to the scheduler
        scheduler.step()
        # Updates the scale for next iteration.
        scaler.update()
        
        # print loss
        if i % 10 == 0:
            print('loss: ', loss.item())

# record training time
trainingtime = time.time() - start_time
print('Training time: ', trainingtime)

# create environment to evaluate the model


  0%|          | 1/500 [00:01<13:18,  1.60s/it]

loss:  nan


  0%|          | 2/500 [00:03<13:18,  1.60s/it]

loss:  nan


  1%|          | 3/500 [00:04<13:15,  1.60s/it]

loss:  nan


  1%|          | 4/500 [00:06<13:09,  1.59s/it]

loss:  nan


  1%|          | 5/500 [00:07<13:07,  1.59s/it]

loss:  nan


  1%|          | 6/500 [00:09<13:06,  1.59s/it]

loss:  nan


  1%|▏         | 7/500 [00:11<13:05,  1.59s/it]

loss:  nan


  2%|▏         | 8/500 [00:12<13:04,  1.60s/it]

loss:  nan


  2%|▏         | 9/500 [00:14<12:57,  1.58s/it]

loss:  nan


  2%|▏         | 10/500 [00:15<12:57,  1.59s/it]

loss:  nan


  2%|▏         | 11/500 [00:17<12:58,  1.59s/it]

loss:  nan


  2%|▏         | 12/500 [00:19<12:57,  1.59s/it]

loss:  nan


  3%|▎         | 13/500 [00:20<12:58,  1.60s/it]

loss:  nan


  3%|▎         | 14/500 [00:22<12:57,  1.60s/it]

loss:  nan


  3%|▎         | 15/500 [00:23<12:55,  1.60s/it]

loss:  nan


  3%|▎         | 16/500 [00:25<12:48,  1.59s/it]

loss:  nan


  3%|▎         | 17/500 [00:27<12:44,  1.58s/it]

loss:  nan


  4%|▎         | 18/500 [00:28<12:42,  1.58s/it]

loss:  nan


  4%|▍         | 19/500 [00:30<12:39,  1.58s/it]

loss:  nan


  4%|▍         | 20/500 [00:31<12:39,  1.58s/it]

loss:  nan


  4%|▍         | 21/500 [00:33<12:39,  1.59s/it]

loss:  nan


  4%|▍         | 22/500 [00:34<12:39,  1.59s/it]

loss:  nan


  5%|▍         | 23/500 [00:36<12:37,  1.59s/it]

loss:  nan


  5%|▍         | 24/500 [00:38<12:40,  1.60s/it]

loss:  nan


  5%|▌         | 25/500 [00:39<12:39,  1.60s/it]

loss:  nan


  5%|▌         | 26/500 [00:41<12:38,  1.60s/it]

loss:  nan


  5%|▌         | 27/500 [00:43<12:39,  1.61s/it]

loss:  nan


  6%|▌         | 28/500 [00:44<12:35,  1.60s/it]

loss:  nan


  6%|▌         | 29/500 [00:46<12:43,  1.62s/it]

loss:  nan


  6%|▌         | 30/500 [00:47<12:37,  1.61s/it]

loss:  nan


  6%|▌         | 31/500 [00:49<12:32,  1.61s/it]

loss:  nan


  6%|▋         | 32/500 [00:51<12:32,  1.61s/it]

loss:  nan


  6%|▋         | 32/500 [00:52<12:50,  1.65s/it]


KeyboardInterrupt: 

In [None]:
# 

In [7]:
# test act_embedding
embed_act = nn.Embedding(act_dim+1, h_dim)
embed_timestep = nn.Embedding(4096, h_dim)
embed_rtg = nn.Linear(1, h_dim)
embed_state = nn.Linear(state_dim, h_dim)

In [50]:
print(act_dim)

1


In [8]:
batch = next(iter(dataloader))
actions = batch['action']
timesteps = batch['timestep']
reward = batch['reward']
states = batch['state']

In [21]:
print(actions.shape)
print(reward.shape)
print(timesteps.shape)
print(states.shape)
print(embed_act.weight.shape)
print(embed_rtg.weight.shape)
print(embed_timestep.weight.shape)
print(embed_state.weight.shape)

torch.Size([4, 28, 1])
torch.Size([4, 28, 1])
torch.Size([4, 28])
torch.Size([4, 28, 5])
torch.Size([1, 128])
torch.Size([128, 1])
torch.Size([4096, 128])
torch.Size([128, 5])


In [10]:
time_emb = embed_timestep(timesteps)
print(time_emb.shape)


torch.Size([4, 28, 128])


In [9]:
statesize = embed_state(states).shape
timesize = embed_timestep(timesteps).shape

# compare the size of the embedding
print(statesize == timesize)

True


In [11]:
states_emb = embed_state(states) + time_emb

In [57]:
reward_emb = embed_rtg(reward)

In [29]:
act_emb = torch.squeeze(embed_act(actions))
act_emb = act_emb + time_emb
