In [1]:
# This notebook uses a basic GPT-based Decision Transformer in an offline reinforcement learning setting to create a stock-trading bot.
# get cuda device
# import libraries
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from cust_transf import DecisionTransformer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np

# utility function to compute the discounted cumulative sum of a vector
def discount_cumsum(x, gamma):
    """Return the discounted cumulative sum of ``x`` along its first axis.

    The result satisfies ``out[-1] = x[-1]`` and
    ``out[t] = x[t] + gamma * out[t + 1]`` for every earlier index.

    Args:
        x: array-like with at least one dimension; the first axis is the one
            that is accumulated over.
        gamma: discount factor applied to the value of the following step.

    Returns:
        A numpy array with the same shape as ``x``. Floating-point inputs keep
        their dtype; any other dtype (e.g. integers) is promoted to float64 so
        that discounted values are not truncated.
    """
    x = np.asarray(x)

    # zeros_like(x) would inherit an integer dtype and silently truncate
    # every fractional discounted value, so pick a floating dtype explicitly
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    disc_cumsum = np.zeros(x.shape, dtype=out_dtype)

    # nothing to accumulate for an empty sequence
    if x.shape[0] == 0:
        return disc_cumsum

    disc_cumsum[-1] = x[-1]

    # walk backwards so that disc_cumsum[t+1] is already final when used
    for t in reversed(range(x.shape[0] - 1)):
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t + 1]
    return disc_cumsum

# utility function to evaluate the performance of the agent on a given environment
# TODO: change it to work with custom environment
def evaluate(model, device, context_len, env, rtg_target, rtg_scale, 
            num_eval_ep=10, max_test_ep_len=28, state_mean=None, state_std=None, render=False):
    """Roll out ``model`` in ``env`` and report average reward and episode length.

    Args:
        model: object exposing ``eval()`` and
            ``forward(states, rtg, timesteps, actions)`` returning a 3-tuple
            whose last element holds the action predictions. This is the
            argument/return order used by the other cells of this notebook.
        device: torch device on which the rollout buffers are allocated.
        context_len: number of time steps fed to the model at once.
        env: environment exposing ``observation_space.shape``,
            ``action_space.shape``, ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)`` and, when ``render`` is true,
            ``render()``.
        rtg_target: return-to-go the agent is conditioned on at step 0.
        rtg_scale: divisor applied to ``rtg_target`` and to every reward.
        num_eval_ep: number of episodes to run.
        max_test_ep_len: maximum number of steps per episode.
        state_mean: per-feature mean used to normalise states (numpy array or
            torch tensor); zeros when ``None``.
        state_std: per-feature std used to normalise states (numpy array or
            torch tensor); ones when ``None``.
        render: call ``env.render()`` after every step when true.

    Returns:
        dict with ``'eval/avg_reward'`` and ``'eval/avg_ep_len'``, both
        averaged over ``num_eval_ep`` episodes.
    """
    eval_batch_size = 1

    results = {}
    total_reward = 0
    # step counter across all episodes; it must exist before the rollout
    # loop increments it
    total_timesteps = 0

    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # identity normalisation by default; as_tensor accepts numpy arrays as
    # well as torch tensors
    if state_mean is None:
        state_mean = torch.zeros((state_dim,), device=device)
    else:
        state_mean = torch.as_tensor(state_mean, device=device)
    if state_std is None:
        state_std = torch.ones((state_dim,), device=device)
    else:
        state_std = torch.as_tensor(state_std, device=device)

    # create timestep for transformer
    timesteps = torch.arange(start=0, end=max_test_ep_len, step=1)
    timesteps = timesteps.unsqueeze(0).repeat(eval_batch_size, 1).to(device)

    # evaluate the agent
    model.eval()
    with torch.no_grad():

        for _ in range(num_eval_ep):

            # zero-filled placeholders for the whole episode
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim),
                                dtype=torch.float32, device=device)

            states = torch.zeros((eval_batch_size, max_test_ep_len, state_dim),
                                dtype=torch.float32, device=device)

            rewards_to_go = torch.zeros((eval_batch_size, max_test_ep_len, 1),
                                dtype=torch.float32, device=device)

            # init episode
            running_state = env.reset()
            running_reward = 0
            running_rtg = rtg_target / rtg_scale

            for t in range(max_test_ep_len):

                total_timesteps += 1

                # add state in placeholder and normalize
                states[0, t] = torch.from_numpy(running_state).to(device)
                states[0, t] = (states[0, t] - state_mean) / state_std

                # calculate running rtg and add in placeholder
                running_rtg = running_rtg - (running_reward / rtg_scale)
                rewards_to_go[0, t] = running_rtg

                if t < context_len:
                    # context not full yet: feed the first context_len slots
                    # (later ones are still zero) and read the prediction at t
                    window, pick = slice(0, context_len), t
                else:
                    # sliding window ending at t; the prediction is the last slot
                    window, pick = slice(t - context_len + 1, t + 1), -1

                # same argument and return order as the model.forward calls
                # in the test-run and training cells of this notebook
                _, _, act_preds = model.forward(states[:, window],
                                                rewards_to_go[:, window],
                                                timesteps[:, window],
                                                actions[:, window])
                act = act_preds[0, pick].detach()

                running_state, running_reward, done, _ = env.step(act.cpu().numpy())

                # add action in placeholder
                actions[0, t] = act

                total_reward += running_reward

                if render:
                    env.render()
                if done:
                    break

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_ep_len'] = total_timesteps / num_eval_ep
    
    return results




In [3]:
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
# define a custom dataset class which loads the data and modifies the reward to be the discounted cumulative sum
class CustomDataset(Dataset):
    """Trajectory dataset read from a JSON file.

    Each item is a tuple ``(normalised_state, action, discounted_reward,
    timestep)``: states are standardised with the per-feature mean/std computed
    over the whole file, and rewards are replaced by their discounted
    cumulative sum.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, file_name, gamma):
        """Load ``file_name`` and pre-compute the state normalisation statistics.

        Args:
            file_name: path of the JSON file; records are read from its
                ``'data'`` field.
            gamma: discount factor used for the cumulative reward.
        """
        self.gamma = gamma

        # load the data
        data = load_dataset("json", data_files = file_name, field = 'data').with_format('torch')
        self.data = data['train']

        # fetch the state column once and flatten every leading axis so the
        # statistics are taken per feature over all trajectories and steps
        states = self.data['state']
        allstates = states.reshape(-1, states.shape[-1])
        state_std, self.state_mean = torch.std_mean(allstates, dim = 0)

        # a constant feature has std 0, which would turn the normalised
        # states into NaN/inf; keep a small positive floor instead
        self.state_std = torch.clamp(state_std, min=1e-6)

    def __len__(self):
        """Number of records in the loaded split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(normalised state, action, discounted reward, timestep)`` for ``idx``."""
        data = self.data[idx]
        # calculate the discounted cumulative sum
        discount_reward = discount_cumsum(data['reward'], self.gamma)
        normalised_state = (data['state'] - self.state_mean) / self.state_std
        return normalised_state, data['action'], torch.from_numpy(discount_reward), data['timestep']


In [4]:
# load huggingface dataset from json file
# (path is relative to the notebook's working directory)
filename = 'AAPL_2190_2016-01-01_1d_random_replaybuffer.json'

# gamma is the discount factor CustomDataset applies to the rewards
dataset = CustomDataset(filename, gamma = 0.99)

Using custom data configuration default-3a69897addd28b22
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-3a69897addd28b22/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 822.25it/s]


In [5]:
# define training parameters
batch_size = 2        # samples per DataLoader batch
lr = 3e-5             # base learning rate passed to AdamW
wt_decay = 1e-4       # AdamW weight decay
# number of scheduler steps over which the LR multiplier ramps up to 1.0
# NOTE(review): the logged training run shows 500 steps per epoch; if that
# still holds, n_epochs=2 ends long before warmup_steps and the LR never
# reaches `lr` — confirm this is intended.
warmup_steps = 10000
n_epochs = 2          # passes over the dataloader in the training loop

In [6]:
# create dataloader from dataset
# shuffle=True re-orders the samples on every pass, so consecutive
# next(iter(dataloader)) calls generally return different batches
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [7]:
# define model parameters
# sample 1 batch from dataloader
# (the batch is only used for its shapes; the names are re-bound by later cells)
norm_state, actions, rtg, timestep = next(iter(dataloader))
# use batch shape to determine state dimension
state_dim = norm_state.shape[-1]
# size of the last axis of the action tensor
# NOTE(review): this was labelled "discrete action space", but the training
# cell regresses actions with an MSE loss — confirm which one is meant.
act_dim = actions.shape[-1]
# use batch shape to determine context length
# (the last axis of `timestep`, i.e. the full length of a stored sequence)
context_len = timestep.shape[-1] # K in decision transformer

n_blocks = 3 # number of transformer blocks
h_dim = 96 # hidden dimension
n_heads = 3 # number of heads in multi-head attention
drop_p = 0.1 # dropout probability


In [8]:
# create the model
model = DecisionTransformer(state_dim, act_dim, n_blocks, h_dim, context_len, n_heads, drop_p).to(device)

# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wt_decay)

# create scheduler
# linear warm-up: the LR multiplier grows as (step + 1) / warmup_steps and is
# capped at 1.0 once warmup_steps scheduler steps have been taken
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(1.0, (step + 1) / warmup_steps))

# create a GradScaler for mixed precision training
# NOTE(review): the training cell runs its forward pass under
# autocast(enabled=False), so the scaler is currently the only AMP piece in
# use — confirm whether mixed precision is actually wanted.
scaler = torch.cuda.amp.GradScaler()

In [10]:
# Smoke test: push one batch through the model without tracking gradients
# and print the shapes of every tensor involved.
with torch.no_grad():
    # move the whole batch to the target device in one go
    norm_state, actions, rtg, timestep = (
        tensor.to(device) for tensor in next(iter(dataloader))
    )
    # independent copy of the actions to serve as regression targets
    action_targets = actions.clone().detach()
    return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

    # shapes of: normalised states, returns-to-go, timesteps, actions,
    # predicted actions and action targets (one per line)
    print(
        *(item.shape for item in (norm_state, rtg, timestep, actions, act_preds, action_targets)),
        sep='\n',
    )

    # flatten batch and time axes, as done when computing the action loss
    act_preds = act_preds.view(-1, act_dim)
    action_targets = action_targets.view(-1, act_dim)

    # shapes after flattening: targets first, then predictions
    print(action_targets.shape, act_preds.shape, sep='\n')

# report, one flag per line, whether any input or output contains NaN values
print(
    *(
        torch.isnan(item).any()
        for item in (norm_state, rtg, timestep, actions, act_preds, action_targets)
    ),
    sep='\n',
)



torch.Size([4, 1509, 13])
torch.Size([4, 1509, 1])
torch.Size([4, 1509])
torch.Size([4, 1509, 2])
torch.Size([4, 1509, 2])
torch.Size([4, 1509, 2])
torch.Size([6036, 2])
torch.Size([6036, 2])
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')


In [11]:
# show predicted actions, target actions and normalised states, one after another
print(act_preds, action_targets, norm_state, sep='\n')

tensor([[-0.1709,  0.2726],
        [-0.3959,  0.1280],
        [-0.4118, -0.5362],
        ...,
        [ 0.4393,  0.5826],
        [ 0.5670, -0.2079],
        [-0.0441,  0.1552]], device='cuda:0')
tensor([[ 0.0221,  0.4075],
        [-0.9087,  0.1251],
        [-0.4329,  0.0531],
        ...,
        [ 0.0811,  0.2015],
        [ 0.5435,  0.6152],
        [-0.1350,  0.5347]], device='cuda:0')
tensor([[[-0.9650, -0.9705, -0.9736,  ..., -1.5698, -1.8749, -1.4157],
         [-0.9918, -0.9805, -0.9910,  ..., -1.5698, -1.8749, -1.4157],
         [-0.9727, -0.9776, -0.9885,  ..., -1.5698, -1.8747, -1.4156],
         ...,
         [ 2.6713,  2.6457,  2.7065,  ...,  2.5513,  1.3809,  1.9503],
         [ 2.7016,  2.7317,  2.7509,  ...,  2.5545,  1.3809,  1.9503],
         [ 2.7765,  2.7536,  2.7870,  ...,  2.5952,  1.3809,  1.9503]],

        [[-0.9650, -0.9705, -0.9736,  ..., -1.5698, -1.8749, -1.4157],
         [-0.9918, -0.9805, -0.9910,  ..., -1.5698, -1.8749, -1.4157],
         [-0.9727,

In [9]:
# create training loop
from tqdm import tqdm

# get the start time to calculate training time
import datetime
start_time = datetime.datetime.now()

for i in range(n_epochs):
    model.train()
    # per-batch action losses of the current epoch
    log_action_losses = []
    for norm_state, actions, rtg, timestep in tqdm(dataloader):
        # get batch data to device
        norm_state = norm_state.to(device)
        actions = actions.to(device)
        rtg = rtg.to(device)
        timestep = timestep.to(device)

        # the model is trained to reproduce the actions stored in the batch
        action_targets = actions.detach().clone()

        # Zeroes out the gradients
        optimizer.zero_grad()

        # autocast is explicitly disabled, so the forward pass and the loss
        # are computed in full precision
        with torch.cuda.amp.autocast(enabled=False):
            return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

            # calculate losses just for actions
            act_preds = act_preds.view(-1, act_dim)
            action_targets = action_targets.view(-1, act_dim)

            loss = F.mse_loss(act_preds, action_targets, reduction='mean')

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # Bring the gradients back to their true magnitude before clipping;
        # otherwise the 0.25 threshold is applied to gradients that are still
        # multiplied by the scaler's (large) scale factor and the real update
        # ends up far smaller than intended.
        scaler.unscale_(optimizer)

        # Clips the (now unscaled) gradients by norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)

        # scaler.step() skips optimizer.step() when the gradients contain
        # infs or NaNs; it does not unscale again after an explicit unscale_().
        scaler.step(optimizer)

        # Updates the learning rate according to the scheduler
        scheduler.step()
        # Updates the scale for next iteration.
        scaler.update()

        # append action loss to log
        log_action_losses.append(loss.item())
        # print the first loss and then every 100th one
        if len(log_action_losses) % 100 == 0 or len(log_action_losses) == 1:
            print('Loss: ', log_action_losses[-1])


# record training time
end_time = datetime.datetime.now()
print('Training time: ', end_time - start_time)

# create environment to evaluate the model


  0%|          | 1/500 [00:02<16:52,  2.03s/it]

Loss:  0.4037802219390869


 20%|██        | 100/500 [02:43<10:46,  1.62s/it]

Loss:  0.3969181478023529


 40%|████      | 200/500 [05:28<08:14,  1.65s/it]

Loss:  0.4006848931312561


 60%|██████    | 300/500 [08:13<05:30,  1.65s/it]

Loss:  0.3894937038421631


 80%|████████  | 400/500 [10:58<02:41,  1.62s/it]

Loss:  0.38199514150619507


100%|██████████| 500/500 [13:39<00:00,  1.64s/it]


Loss:  0.35910943150520325


  0%|          | 1/500 [00:01<13:28,  1.62s/it]

Loss:  0.36109912395477295


 20%|██        | 100/500 [02:40<10:42,  1.61s/it]

Loss:  0.3591246008872986


 40%|████      | 200/500 [05:20<07:55,  1.59s/it]

Loss:  0.3461839258670807


 60%|██████    | 300/500 [08:00<05:19,  1.60s/it]

Loss:  0.33066847920417786


 80%|████████  | 400/500 [10:40<02:40,  1.60s/it]

Loss:  0.3286522626876831


100%|██████████| 500/500 [13:19<00:00,  1.60s/it]

Loss:  0.32495033740997314
Training time:  0:26:59.402146





In [None]:
# 

In [7]:
# test act_embedding
# Stand-alone embedding layers for experimenting with shapes; they are
# created here independently of the DecisionTransformer model above.
embed_act = nn.Embedding(act_dim+1, h_dim)      # lookup table with act_dim+1 rows of width h_dim
embed_timestep = nn.Embedding(4096, h_dim)      # lookup table for timestep indices 0..4095
embed_rtg = nn.Linear(1, h_dim)                 # projects a 1-feature input to h_dim
embed_state = nn.Linear(state_dim, h_dim)       # projects a state_dim-feature input to h_dim

In [50]:
# NOTE(review): the value logged for this cell differs from the action width
# printed by the test-run cell, so this output looks stale — re-run to confirm.
print(act_dim)

1


In [8]:
# Pull one batch and pick its fields by name.
# NOTE(review): this indexes the batch with string keys, whereas
# CustomDataset.__getitem__ in this notebook returns a tuple — this cell
# appears to predate that class; confirm before re-running.
batch = next(iter(dataloader))
actions = batch['action']
timesteps = batch['timestep']
reward = batch['reward']
states = batch['state']

In [21]:
# shapes of the batch tensors, then of the embedding weight matrices (one per line)
print(
    actions.shape,
    reward.shape,
    timesteps.shape,
    states.shape,
    embed_act.weight.shape,
    embed_rtg.weight.shape,
    embed_timestep.weight.shape,
    embed_state.weight.shape,
    sep='\n',
)

torch.Size([4, 28, 1])
torch.Size([4, 28, 1])
torch.Size([4, 28])
torch.Size([4, 28, 5])
torch.Size([1, 128])
torch.Size([128, 1])
torch.Size([4096, 128])
torch.Size([128, 5])


In [10]:
# look up an h_dim-wide vector for every timestep index in the batch
time_emb = embed_timestep(timesteps)
print(time_emb.shape)


torch.Size([4, 28, 128])


In [9]:
# output shapes of the state and timestep embeddings for the same batch
statesize = embed_state(states).shape
timesize = embed_timestep(timesteps).shape

# compare the size of the embedding
# (they must match for the two embeddings to be added element-wise)
print(statesize == timesize)

True


In [11]:
# state embedding with the timestep embedding added element-wise
states_emb = embed_state(states) + time_emb

In [57]:
# project the reward tensor through the 1 -> h_dim linear layer
reward_emb = embed_rtg(reward)

In [29]:
# Embedding the actions appends an h_dim axis after the action axis. Drop only
# that action axis (second to last): a bare squeeze() would also remove the
# batch or sequence axis whenever it happens to have size 1.
act_emb = torch.squeeze(embed_act(actions), dim=-2)
# add the timestep embedding element-wise
act_emb = act_emb + time_emb
