In [1]:
# this notebook will use a basic GPT based decision transformer in offline reinforcement learning setting to create bot for trading stock
# get cuda device
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# import libraries
import math

import torch.nn as nn
import torch.nn.functional as F

# based on https://github.com/nikhilbarhate99/min-decision-transformer/blob/master/decision_transformer/model.py

# define the masked causal attention
class MaskedAttention(nn.Module):
    def __init__(self, h_dim, max_T, n_heads, drop_p):
        super().__init__()
        self.n_heads = n_heads
        self.drop_p = drop_p
        # feed forward networks which create the query, key and value
        self.Q_net = nn.Linear(h_dim, h_dim)
        self.K_net = nn.Linear(h_dim, h_dim)
        self.V_net = nn.Linear(h_dim, h_dim)

        # feed forward network which projects the attention to the correct dimension
        self.proj_net = nn.Linear(h_dim, h_dim)

        # dropout layers
        self.att_drop = nn.Dropout(drop_p)
        self.proj_drop = nn.Dropout(drop_p)

        # create the mask
        mask = torch.tril(torch.ones(max_T, max_T)).view(1, 1, max_T, max_T)

        # register_buffer will make the mask a constant tensor
        # so that it will not be included in the model parameters and be updated during backpropagation
        self.register_buffer('mask', mask)

    def forward(self, x):
        # x: [B, T, H]
        B, T, C = x.shape # batch size, sequence length, hidden dimension * number of heads
        N, D = self.n_heads, C // self.n_heads # number of heads, dimension of each head

        # compute the query, key and value
        Q = self.Q_net(x).view(B, T, N, D).transpose(1, 2) # [B, N, T, D]
        K = self.K_net(x).view(B, T, N, D).transpose(1, 2)
        V = self.V_net(x).view(B, T, N, D).transpose(1, 2)

        # compute the attention
        weights = Q @ K.transpose(2,3) / math.sqrt(D) # QK^T / sqrt(D)
        weights = weights.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf')) # mask the future tokens
        normalized_weights = F.softmax(weights, dim=-1) # softmax along the last dimension
        A = self.att_drop(normalized_weights @ V) # attention with dropout

        # compute the output
        # gather heads and project to correct dimension
        attention = A.transpose(1, 2).contiguous().view(B, T, N*D)
        out = self.proj_drop(self.proj_net(attention))

        return out

# define the attention block with layer normalization and residual connection as well as the feed forward network
class AttentionBlock(nn.Module):
    def __init__(self, h_dim, max_T, n_heads, drop_p):
        super().__init__()
        self.attention = MaskedAttention(h_dim, max_T, n_heads, drop_p)
        self.norm1 = nn.LayerNorm(h_dim)
        self.norm2 = nn.LayerNorm(h_dim)
        self.ffn = nn.Sequential(
            nn.Linear(h_dim, 4*h_dim),
            nn.GELU(),
            nn.Linear(4*h_dim, h_dim),
            nn.Dropout(drop_p)
        )

    def forward(self, x):
        # x: [B, T, H]
        # Attention -> LayerNorm -> Residual -> FFN -> LayerNorm -> Residual
        out = self.norm1(x + self.attention(x))
        out = self.norm2(out + self.ffn(out))

        return out

# define the decision transformer
class DecisionTransformer(nn.Module):
    def __init__(self, state_dim, act_dim, n_block, h_dim, context_len, n_heads, drop_p, max_timestep = 4096):
        super().__init__()
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.h_dim = h_dim

        # transformer blocks
        input_seq_len = 3 * context_len
        blocks = [AttentionBlock(h_dim, input_seq_len, n_heads, drop_p) for _ in range(n_block)]
        self.transformer = nn.Sequential(*blocks)

        # projection heads (project to embedding)
        self.embed_ln = nn.LayerNorm(h_dim)
        self.embed_timestep = nn.Embedding(max_timestep, h_dim)
        self.embed_rtg = nn.Linear(1, h_dim)
        self.embed_state = nn.Linear(state_dim, h_dim)

        # discrete actions
        #self.embed_act = torch.nn.Embedding(act_dim+1, h_dim)
        #use_action_tah = False # for discrete action

        # continuous actions
        self.embed_act = nn.Linear(act_dim, h_dim)
        use_action_tah = True # for continuous action

        # prediction heads
        self.pred_rtg = nn.Linear(h_dim, 1)
        self.pred_state = nn.Linear(h_dim, state_dim)
        self.pred_act = nn.Sequential(*([nn.Linear(h_dim, act_dim)] + ([nn.Tanh()] if use_action_tah else [])))
    
    def forward(self, state, rtg, timestep, actions):
        B, T, _ = state.shape

        # timestep embedding
        time_emb = self.embed_timestep(timestep)

        # embedding for the state, reward and actions along with time embedding
        state_emb = self.embed_state(state) + time_emb
        rtg_emb = self.embed_rtg(rtg) + time_emb
        act_emb = self.embed_act(actions)
        if act_emb.shape != time_emb.shape:
            act_emb = torch.squeeze(act_emb) # fix the unmatch dimension
        act_emb = act_emb + time_emb

        # stack the embeddings and reshape sequence as (r1, s1, a1, r2, s2, a2, ...)
        h = torch.stack([rtg_emb, state_emb, act_emb], dim=1).permute(0,2,1,3).reshape(B, 3*T, self.h_dim)
        h = self.embed_ln(h)

        # transformer blocks
        h = self.transformer(h)

        # get h reshaped such that its size is (B, 3, T, h_dim) and
        # h[:, 0, t] is conditioned on r_0, s_0, a_0, ..., r_t
        # h[:, 1, t] is conditioned on r_0, s_0, a_0, ..., r_t, s_t
        # h[:, 2, t] is conditioned on r_0, s_0, a_0, ..., r_t, s_t, a_t
        h = h.reshape(B, T, 3, self.h_dim).permute(0,2,1,3)

        # get predictions
        return_preds = self.pred_rtg(h[:,2])    # predict next rtg given r, s, a
        state_preds = self.pred_state(h[:,2])   # predict next state given r, s, a
        act_preds = self.pred_act(h[:,1])       # predict action given r, s

        return return_preds, state_preds, act_preds

In [2]:
import numpy as np

#utility function to compute the discounted cumulative sum of a vector
def discount_cumsum(x, gamma):
    disc_cumsum = np.zeros_like(x)
    
    disc_cumsum[-1] = x[-1]
    
    for t in reversed(range(x.shape[0]-1)):
        
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t+1]
    return disc_cumsum

#utility function to evaluate the performance of the agent on a given environment
def evaluate(model, device, context_len, env, rtg_target, rtg_scale, 
            num_eval_ep=10, max_test_ep_len=28, state_mean=None, state_std=None, render=False):
    eval_batch_size = 1

    results = {}
    total_reward = 0
    total_length = 0

    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    state_mean = torch.zeros((state_dim,)).to(device) if state_mean is None else torch.from_numpy(state_mean).to(device)
    state_std = torch.ones((state_dim,)).to(device) if state_std is None else torch.from_numpy(state_std).to(device)
    
    # create timestep for transformer
    timesteps = torch.arange(start=0, end=max_test_ep_len, step=1)
    timesteps = timesteps.unsqueeze(0).repeat(eval_batch_size, 1).to(device)

    # evaluate the agent
    model.eval()
    with torch.no_grad():

        for _ in range(num_eval_ep):

            # zeros place holders
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim),
                                dtype=torch.float32, device=device)

            states = torch.zeros((eval_batch_size, max_test_ep_len, state_dim),
                                dtype=torch.float32, device=device)
            
            rewards_to_go = torch.zeros((eval_batch_size, max_test_ep_len, 1),
                                dtype=torch.float32, device=device)
            
            # init episode
            running_state = env.reset()
            running_reward = 0
            running_rtg = rtg_target / rtg_scale

            for t in range(max_test_ep_len):

                total_timesteps += 1

                # add state in placeholder and normalize
                states[0, t] = torch.from_numpy(running_state).to(device)
                states[0, t] = (states[0, t] - state_mean) / state_std

                # calcualate running rtg and add in placeholder
                running_rtg = running_rtg - (running_reward / rtg_scale)
                rewards_to_go[0, t] = running_rtg

                if t < context_len:
                    _, act_preds, _ = model.forward(timesteps[:,:context_len],
                                                states[:,:context_len],
                                                actions[:,:context_len],
                                                rewards_to_go[:,:context_len])
                    act = act_preds[0, t].detach()
                else:
                    _, act_preds, _ = model.forward(timesteps[:,t-context_len+1:t+1],
                                                states[:,t-context_len+1:t+1],
                                                actions[:,t-context_len+1:t+1],
                                                rewards_to_go[:,t-context_len+1:t+1])
                    act = act_preds[0, -1].detach()


                running_state, running_reward, done, _ = env.step(act.cpu().numpy())

                # add action in placeholder
                actions[0, t] = act

                total_reward += running_reward

                if render:
                    env.render()
                if done:
                    break

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_ep_len'] = total_timesteps / num_eval_ep
    
    return results




In [3]:
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
# define a custom dataset class which loads the data and modifies the reward to be the discounted cumulative sum
class CustomDataset(Dataset):
    def __init__(self, file_name, gamma):
        self.gamma = gamma

        # load the data
        data = load_dataset("json", data_files = file_name, field = 'data').with_format('torch')
        self.data = data['train']
        # calculate the mean and std of the state
        stateshape = data['train']['state'].shape
        allstates = data['train']['state'].reshape([stateshape[0]*stateshape[1], stateshape[2]])
        self.state_std, self.state_mean = torch.std_mean(allstates, dim = 0)
        

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        # calculate the discounted cumulative sum
        discount_reward = discount_cumsum(data['reward'], self.gamma)
        #
        # get the state, action, reward and next state
        return (data['state']-self.state_mean)/self.state_std, data['action'], torch.from_numpy(discount_reward), data['timestep']


In [4]:
# load huggingface dataset from json file
filename = 'AAPL_2190_2016-01-01_1d_random_replaybuffer.json'

dataset = CustomDataset(filename, gamma = 0.99)

Using custom data configuration default-d6108ac762ae0e39
Found cached dataset json (/home/victoru/.cache/huggingface/datasets/json/default-d6108ac762ae0e39/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 88.00it/s]


In [5]:

# define training parameters
batch_size = 2
lr = 1e-4
wt_decay = 1e-4
warmup_steps = 10000
n_epochs = 2

In [6]:
# create dataloader from dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [7]:
# define model parameters
# sample 1 batch from dataloader
norm_state, actions, rtg, timestep = next(iter(dataloader))
# use batch shape to determine state dimension
state_dim = norm_state.shape[-1]
act_dim = actions.shape[-1] # discrete action space
# use batch shape to determine context length
context_len = timestep.shape[-1] # K in decision transformer

n_blocks = 3 # number of transformer blocks
h_dim = 96 # hidden dimension
n_heads = 3 # number of heads in multi-head attention
drop_p = 0.1 # dropout probability

#

In [15]:
# create the model
model = DecisionTransformer(state_dim, act_dim, n_blocks, h_dim, context_len, n_heads, drop_p).to(device)

# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wt_decay)

# create scheduler
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(1.0, (step + 1) / warmup_steps))

# create a GradScaler for mixed precision training
scaler = torch.cuda.amp.GradScaler()

# create a grad

In [16]:
# test run the model
with torch.no_grad():
    norm_state, actions, rtg, timestep = next(iter(dataloader))
    norm_state = norm_state.to(device)
    actions = actions.to(device)
    rtg = rtg.to(device)
    timestep = timestep.to(device)
    action_targets = torch.clone(actions).detach().to(device)
    return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

    # check shape of norm_state
    print(norm_state.shape)
    # check shape of rtg
    print(rtg.shape)
    # check shape of timestep
    print(timestep.shape)
    # check shape of actions
    print(actions.shape)
    print(act_preds.shape)
    print(action_targets.shape)
    
    # calculate losses just for actions
    act_preds = act_preds.view(-1, act_dim)
    action_targets = action_targets.view(-1, act_dim)

    # check shape of action targets
    print(action_targets.shape)
    # check shape of action predictions
    print(act_preds.shape)

torch.Size([2, 1509, 14])
torch.Size([2, 1509, 1])
torch.Size([2, 1509])
torch.Size([2, 1509, 2])
torch.Size([2, 1509, 2])
torch.Size([2, 1509, 2])
torch.Size([3018, 2])
torch.Size([3018, 2])


In [18]:
# create training loop
from tqdm import tqdm

# get the start time to calculate training time
import time
start_time = time.time()

for i in range(n_epochs):
    model.train()
    for norm_state, actions, rtg, timestep in tqdm(dataloader):
        # get batch data to device
        norm_state = norm_state.to(device)
        actions = actions.to(device)
        rtg = rtg.to(device)
        timestep = timestep.to(device)

        action_targets = torch.clone(actions).detach().to(device)

        # Zeroes out the gradients
        optimizer.zero_grad()

        # run forward pass with autocasting
        with torch.cuda.amp.autocast():
            return_preds, state_preds, act_preds = model.forward(norm_state, rtg, timestep, actions)

            # calculate losses just for actions
            act_preds = act_preds.view(-1, act_dim)
            action_targets = action_targets.view(-1, act_dim)

            loss = F.mse_loss(act_preds, action_targets, reduction='mean')

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # Clips the gradients by norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()
        
        # Updates the learning rate according to the scheduler
        scheduler.step()

        # print loss
        if i % 10 == 0:
            print(loss.item())

# record training time
trainingtime = time.time() - start_time
print('Training time: ', trainingtime)

# create environment to evaluate the model


  loss = F.mse_loss(act_preds, action_targets, reduction='mean')
  0%|          | 0/500 [00:01<?, ?it/s]


RuntimeError: The size of tensor a (6) must match the size of tensor b (3018) at non-singleton dimension 0

In [None]:
# 

In [7]:
# test act_embedding
embed_act = nn.Embedding(act_dim+1, h_dim)
embed_timestep = nn.Embedding(4096, h_dim)
embed_rtg = nn.Linear(1, h_dim)
embed_state = nn.Linear(state_dim, h_dim)

In [50]:
print(act_dim)

1


In [8]:
batch = next(iter(dataloader))
actions = batch['action']
timesteps = batch['timestep']
reward = batch['reward']
states = batch['state']

In [21]:
print(actions.shape)
print(reward.shape)
print(timesteps.shape)
print(states.shape)
print(embed_act.weight.shape)
print(embed_rtg.weight.shape)
print(embed_timestep.weight.shape)
print(embed_state.weight.shape)

torch.Size([4, 28, 1])
torch.Size([4, 28, 1])
torch.Size([4, 28])
torch.Size([4, 28, 5])
torch.Size([1, 128])
torch.Size([128, 1])
torch.Size([4096, 128])
torch.Size([128, 5])


In [10]:
time_emb = embed_timestep(timesteps)
print(time_emb.shape)


torch.Size([4, 28, 128])


In [9]:
statesize = embed_state(states).shape
timesize = embed_timestep(timesteps).shape

# compare the size of the embedding
print(statesize == timesize)

True


In [11]:
states_emb = embed_state(states) + time_emb

In [57]:
reward_emb = embed_rtg(reward)

In [29]:
act_emb = torch.squeeze(embed_act(actions))
act_emb = act_emb + time_emb
