In [1]:
import torch
import os
from cust_transf import DecisionTransformer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# location of the saved checkpoint: <directory>/<model_name>
directory = 'model'
model_name = 'AAPL_model.pt'

# Hyperparameters handed to the DecisionTransformer constructor below.
# load_state_dict() requires the resulting parameter names/shapes to match the
# checkpoint, so these presumably mirror the training configuration — confirm.
state_dim = 13
act_dim = 2 # discrete action space
# number of most recent timesteps fed to the model per forward pass
context_len = 20
n_blocks = 4 # number of transformer blocks
h_dim = 96 # hidden dimension
n_heads = 8 # number of heads in multi-head attention
drop_p = 0.1 # dropout probability

model = DecisionTransformer(state_dim, act_dim, n_blocks, h_dim, context_len, n_heads, drop_p).to(device)

# Load the trained weights from the checkpoint file.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(os.path.join(directory, model_name), map_location=device))



  from .autonotebook import tqdm as notebook_tqdm


<All keys matched successfully>

In [2]:
# evaluate the model by running it on the open ai gym environment
# Example of the environment usage:
# import gymnasium as gym
# import pandas as pd
# from TradingEnvClass import StockTradingEnv

# load stock price data
# df = pd.read_csv('stock_prices.csv')

# create trading environment
# env = StockTradingEnv(df, init_balance=10000, max_step=1000, random=True)

# reset environment to initial state
# obs = env.reset()

# loop over steps
# for i in range(1000):
#     # choose random action
#     action = env.action_space.sample()
#     # step forward in time
#     obs, reward, done, info = env.step(action)
#     # render environment
#     env.render()
#     # check if episode is done
#     if done:
#         break

# the model has four inputs: norm_state, rtg, timestep, actions and three outputs: return_preds, state_preds, act_preds
# norm_state is the normalized state of the environment, a tensor of shape (batch_size, seq_len, state_dim)
# rtg is the return-to-go, a tensor of shape (batch_size, seq_len, 1)
# timestep is the timestep of the environment, a tensor of shape (batch_size, seq_len)
# actions is the actions taken by the agent, a tensor of shape (batch_size, seq_len, act_dim)
# return_preds is the predicted return of the environment which is a tensor of shape (batch_size, seq_len)
# state_preds is the predicted state of the environment which is a tensor of shape (batch_size, seq_len, state_dim)
# act_preds is the predicted action, a tensor of shape (batch_size, seq_len, act_dim)

# the custom environment has one input: actions which is a numpy.ndarray with shape (2,) and four outputs: obs, reward, done, info where obs and reward are numpy.ndarray and done and info are bool and dict respectively

def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both reset conventions: a bare observation, or the Gymnasium-style
    ``(observation, info_dict)`` tuple.
    """
    reset_out = env.reset()
    if isinstance(reset_out, tuple) and len(reset_out) == 2 and isinstance(reset_out[1], dict):
        return reset_out[0]
    return reset_out


def _step_env(env, action):
    """Step ``env`` with ``action`` and return ``(observation, reward, done)``.

    Accepts both the 4-tuple ``(obs, reward, done, info)`` and the Gymnasium
    5-tuple ``(obs, reward, terminated, truncated, info)``.
    """
    step_out = env.step(action)
    if len(step_out) == 5:
        obs, reward, terminated, truncated, _ = step_out
        return obs, reward, bool(terminated or truncated)
    obs, reward, done, _ = step_out
    return obs, reward, done


def evaluate_on_env(model, device, context_len, env, rtg_target, rtg_scale, num_eval_ep=10, max_test_ep_len=1000, state_mean=None, state_std=None, render=False):
    """Roll ``model`` out in ``env`` and report average reward and episode length.

    Each episode starts from ``env.reset()`` and runs until the environment
    signals termination or ``max_test_ep_len`` steps have been taken. At every
    step the model sees at most the last ``context_len`` timesteps of
    (normalized states, returns-to-go, timesteps, actions); its action
    prediction for the current step is sent to the environment.

    Args:
        model: callable module taking ``(states, rtg, timestep, actions)`` and
            returning a 3-tuple whose last element holds the action predictions.
        device: torch device on which all rollout tensors are allocated.
        context_len: maximum number of timesteps passed to the model at once.
        env: environment exposing ``observation_space.shape``,
            ``action_space.shape``, ``reset()``, ``step(action)`` and, when
            ``render`` is true, ``render()``.
        rtg_target: return-to-go the rollout is conditioned on at step 0.
        rtg_scale: divisor applied to ``rtg_target`` and to every reward before
            it is subtracted from the running return-to-go.
        num_eval_ep: number of episodes to run (must be >= 1).
        max_test_ep_len: hard cap on the number of steps per episode.
        state_mean: per-feature mean used to normalize states (defaults to 0).
        state_std: per-feature std used to normalize states (defaults to 1).
        render: call ``env.render()`` after every step when true.

    Returns:
        dict with ``'eval/avg_reward'`` (sum of all rewards / ``num_eval_ep``)
        and ``'eval/avg_steps'`` (total steps taken / ``num_eval_ep``).

    Raises:
        ValueError: if ``num_eval_ep`` is smaller than 1.
    """
    if num_eval_ep < 1:
        raise ValueError(f"num_eval_ep must be >= 1, got {num_eval_ep}")

    eval_batch_size = 1 # required for forward pass

    results = {}
    total_reward = 0
    total_steps = 0

    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # as_tensor accepts lists, numpy arrays and tensors alike without the
    # copy-construct warning that torch.tensor() emits for tensor inputs
    if state_mean is None:
        state_mean = torch.zeros(state_dim, device=device)
    else:
        state_mean = torch.as_tensor(state_mean, device=device)

    if state_std is None:
        state_std = torch.ones(state_dim, device=device)
    else:
        state_std = torch.as_tensor(state_std, device=device)

    # absolute timestep index of every position in the episode: 0 .. max_test_ep_len-1
    timestep = torch.arange(start=0, end=max_test_ep_len, step=1, device=device)
    timestep = timestep.repeat(eval_batch_size, 1)

    # evaluate
    model.eval()
    with torch.no_grad():
        for _ in range(num_eval_ep):

            # zero-filled per-episode histories; row t is written at step t
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim), dtype=torch.float32, device=device)
            states = torch.zeros((eval_batch_size, max_test_ep_len, state_dim), dtype=torch.float32, device=device)
            rtg = torch.zeros((eval_batch_size, max_test_ep_len, 1), dtype=torch.float32, device=device)

            # initialize environment
            running_state = _reset_env(env)
            running_reward = 0
            running_rtg = rtg_target / rtg_scale

            for t in range(max_test_ep_len):
                total_steps += 1

                # store the raw state (cast to float32 by the buffer), then normalize it
                states[0, t] = torch.as_tensor(running_state, device=device)
                states[0, t] = (states[0, t] - state_mean) / state_std

                # the return still to be collected shrinks by the last scaled reward
                running_rtg = running_rtg - (running_reward / rtg_scale)
                rtg[0, t] = running_rtg

                # sliding window: everything so far while the episode is shorter
                # than context_len, afterwards only the last context_len steps
                window = slice(max(0, t - context_len + 1), t + 1)
                # call the module itself (not .forward) so registered hooks run
                _, _, act_preds = model(states[:, window], rtg[:, window], timestep[:, window], actions[:, window])
                # prediction for the current step: position t while the window
                # still starts at 0, the last position once the window slides
                act = act_preds[0, t if t < context_len else -1].detach()

                # step in environment using action
                running_state, running_reward, done = _step_env(env, act.cpu().numpy())

                # record the action so later forward passes can condition on it
                actions[0, t] = act
                total_reward += running_reward

                if render:
                    env.render()
                if done:
                    break

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_steps'] = total_steps / num_eval_ep

    return results
                


In [3]:
# import helper function for getting stock data
from getstock import get_stock_data_yf_between_with_indicators
# import time library
from datetime import datetime, timedelta
# get stock data with technical indicators
import json
import os

# --- what to download -------------------------------------------------------
stock_name = 'AAPL'
interval = '1d'
indicators = ['volume_cmf', 'trend_macd', 'momentum_rsi']
output_dir = 'replaybuffer'

# --- date range -------------------------------------------------------------
# window lengths in calendar days; only `period` feeds the download below
period, train_period = 365*6, 365*3
# first day of the window, formatted 'YYYY-MM-DD'
start_date = '2016-01-01'
start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
# end of the training window (computed here, not used further in this cell)
end_train_date_obj = start_date_obj + timedelta(days=train_period)
# last day of the download window, `period` days after the start
end_date_obj = start_date_obj + timedelta(days=period)
end_date = end_date_obj.strftime('%Y-%m-%d')

# fetch the price history together with the requested indicators
stockdata = get_stock_data_yf_between_with_indicators(stock_name, start_date, end_date, interval, indicators)

# --- test environment -------------------------------------------------------
from TradingEnvClass import StockTradingEnv

# starting balance handed to the environment
init_balance = 10000
# one step fewer than the number of rows in the downloaded data
max_step = len(stockdata) - 1

env = StockTradingEnv(stockdata, init_balance, max_step, random=False)

[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  logger.warn(


In [4]:
# Return-to-go conditioning: the rollout starts from rtg_target / rtg_scale.
rtg_target, rtg_scale = 0.001, 1

# Roll the loaded model out on the test environment and collect the averages.
results = evaluate_on_env(
    model,
    device,
    context_len,
    env,
    rtg_target,
    rtg_scale,
    num_eval_ep=10,
    max_test_ep_len=1000,
    state_mean=None,
    state_std=None,
    render=False,
)

In [5]:
# Show the averaged evaluation metrics returned by evaluate_on_env.
print(results)

{'eval/avg_reward': 3316766.0702451942, 'eval/avg_steps': 1000.0}
