In [4]:
# This notebook is for creating and testing method on curating datasets on stock trading data for offline reinforcement learning with decision transformer model
# This will get stock data from yahoo finance
# Then it will use the stock data to create gym environments and sample state, action, reward (both randomly or/and by a trained agent ) which then store as a replay buffer
# Group these replay buffers and export as a dataset

In [1]:
# import helper function for getting stock data
from getstock import get_stock_data_yf_between_with_indicators
# import time library
from datetime import datetime, timedelta
# get stock data with technical indicators
stock_name = 'AAPL'

# period of data to get
period = 365*7
train_period = 365*5
test_period = period - train_period
# start_date in format 'YYYY-MM-DD'
start_date = '2015-01-01'
# calculate end date being x days after start date
start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
end_date_obj = start_date_obj + timedelta(days=period)
end_train_date_obj = start_date_obj + timedelta(days=train_period)
end_date = end_date_obj.strftime('%Y-%m-%d')


interval = '1d'
indicators = ['Volume', 'volume_cmf', 'trend_macd', 'momentum_rsi']

stockdata = get_stock_data_yf_between_with_indicators(stock_name, start_date, end_date, interval, indicators)
stockdata_train = get_stock_data_yf_between_with_indicators(stock_name, start_date, end_train_date_obj, interval, indicators)
stockdata_test = get_stock_data_yf_between_with_indicators(stock_name, end_train_date_obj, end_date, interval, indicators)

# create the gym environment using the stock data
import gym
from TradingEnvClass import StockTradingEnv

init_balance = 20000

import re


[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [2]:
import multiprocessing
multiprocessing.log_to_stderr()



In [3]:
# loop through the stock data and check for any NaN values or inf values
import numpy as np
for col in stockdata.columns:
    if stockdata[col].isnull().values.any() or stockdata[col].isin([np.inf, -np.inf]).values.any():
        print(f'Column {col} has NaN or inf values')

In [3]:
from savefigure import save_figures
teststockdata = stockdata[69:420]
init_balance = 10000
max_step = len(teststockdata)-1

printenv = StockTradingEnv(teststockdata, init_balance, max_step, random = False)

# run the environment with random actions and render the environment\
# this is to check if the environment is working
printenv.reset()
done = False
# create a dictionary to store the frames and the step number
frames = {}
while not done:
    action = printenv.action_space.sample()
    # print action out to check
    obs, reward, done, info = printenv.step(action)
    # check if env.render() returns None
    fig,step = printenv.render(mode='plot')
    if fig is not None:
        frames[step] = fig

printenv.close()

# save the frames as png files
save_figures(frames, 'testenv/')


  logger.warn(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Volume'] = dfvolume
  fig = plt.figure()


In [4]:
# create dictionary with state, action, reward as keys and store the values in a list
# then create a huggingface dataset from the dictionary
# then save the huggingface dataset to a file
import numpy as np
from datasets import Dataset as huggingfaceDataset

max_step = len(stockdata)-1

env = StockTradingEnv(stockdata, init_balance, max_step, random = False)
data = {'data':[]}

# create a loop to sample state, action, reward and store in the dictionary
num_episodes = 500
for i in range(num_episodes):
    # create list for storing state, action, reward
    dict = {'state': [], 'action': [], 'reward': [], 'timestep': []}
    # reset the environment
    env.reset()
    state = env.render(mode='None')
    dict['state'].append(state.tolist())
    timestep = 0
    done = False
    # create a loop to sample action, next_state, reward and store in the dictionary
    while not done:
        # sample action
        action = env.action_space.sample()
        _, reward, done, info = env.step(action)
        next_state = env.render(mode='None')
        # store state, action, reward in the dictionary
        dict['action'].append(action.tolist())
        dict['reward'].append([reward])
        dict['timestep'].append(timestep)
        # update state
        timestep += 1
        state = next_state
        # check if done
        if done:
            print('Episode: ', i, 'Timestep: ', timestep)
            break
        else:
            dict['state'].append(state.tolist())
    
    # store the state, action, reward list in the dictionary
    data['data'].append(dict)

  from .autonotebook import tqdm as notebook_tqdm
  logger.warn(


Episode:  0 Timestep:  20
Episode:  1 Timestep:  1117
Episode:  2 Timestep:  44
Episode:  3 Timestep:  1230
Episode:  4 Timestep:  585
Episode:  5 Timestep:  1761
Episode:  6 Timestep:  565
Episode:  7 Timestep:  309
Episode:  8 Timestep:  1761
Episode:  9 Timestep:  539
Episode:  10 Timestep:  429
Episode:  11 Timestep:  950
Episode:  12 Timestep:  905
Episode:  13 Timestep:  226
Episode:  14 Timestep:  635
Episode:  15 Timestep:  376
Episode:  16 Timestep:  1761
Episode:  17 Timestep:  1284
Episode:  18 Timestep:  666
Episode:  19 Timestep:  785
Episode:  20 Timestep:  105
Episode:  21 Timestep:  818
Episode:  22 Timestep:  52
Episode:  23 Timestep:  1108
Episode:  24 Timestep:  824
Episode:  25 Timestep:  529
Episode:  26 Timestep:  350
Episode:  27 Timestep:  99
Episode:  28 Timestep:  1761
Episode:  29 Timestep:  1164
Episode:  30 Timestep:  656
Episode:  31 Timestep:  860
Episode:  32 Timestep:  178
Episode:  33 Timestep:  756
Episode:  34 Timestep:  1131
Episode:  35 Timestep:  

In [5]:
file_name = stock_name + '_' + str(period) + '_' + str(start_date) + '_' + str(interval) + '_random_replaybuffer.json'
# output the dictionary to a json file
import json
with open(file_name, 'w') as fp:
    json.dump(data, fp)

In [6]:
# train an agent using stable baselines
# import 
# import stable baselines
import torch
# set detect anomaly to true to get more information on the error
torch.autograd.set_detect_anomaly(True)
import torch.nn as nn

from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DDPG

from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# create a custom SubprocVecEnv class to allow rendering of the environment according to the custom render function
class CustomSubprocVecEnv(SubprocVecEnv):
    def __init__(self, env_fns):
        super().__init__(env_fns)
        self.current_env = 0 # index of the current environment to render

    def render(self, mode=None):
        # create an empty numpy array to store the rendered observations
        obs_list = []
        # loop through all the remote objects
        for remote in self.remotes:
            # send a render command with the print argument to the remote object
            remote.send(('render', mode))
            # receive the rendered observation
            obs = remote.recv()
            """
            # show the image using matplotlib if not None
            if img is not None:
                import matplotlib.pyplot as plt
                plt.imshow(img)
                plt.show()
            """
            # append the observation to the list
            obs_list.append(obs)
        # return the observation
        return obs_list


In [15]:
# check pytorch version
print(torch.__version__)


1.11.0


In [7]:
# create the environment as a subprocesses vectorized environment for stable baselines training
num_cpu = 6
env_stable = CustomSubprocVecEnv([lambda: StockTradingEnv(stockdata_train, init_balance, len(stockdata_train)-1, random = False) for i in range(num_cpu)])

# check for nan or inf in the environment
# env_stable = VecCheckNan(env_stable, raise_exception=True)

env_stable_dum = DummyVecEnv([lambda: StockTradingEnv(stockdata_train, init_balance, len(stockdata_train)-1, random = False)])

  logger.warn(
  logger.warn(


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [8]:
# create the models
modelPPO = PPO("MlpPolicy", env_stable, verbose=1)
modelA2C = A2C("MlpPolicy", env_stable, verbose=1)
modelDDPG = DDPG("MlpPolicy", env_stable_dum, verbose=1)
# store the models' name in a list
model_list = [modelPPO, modelA2C, modelDDPG]

Using cuda device
Using cuda device
Using cuda device


In [8]:
# evaluate the models
for model in model_list:
    if model != modelDDPG:
        mean_reward, std_reward = evaluate_policy(model, env_stable, n_eval_episodes=10)
    else:
        mean_reward, std_reward = evaluate_policy(model, env_stable_dum, n_eval_episodes=10)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:12580000.00 +/- 0.00
mean_reward:12580000.00 +/- 0.00
mean_reward:12580000.00 +/- 0.00


In [9]:
# train the models
for model in model_list:
    
    print('Training model: ', model)
    model.learn(total_timesteps=len(stockdata_train)*20)
    print('Model trained')
        

Training model:  <stable_baselines3.ppo.ppo.PPO object at 0x7ff257f4bc10>
------------------------------
| time/              |       |
|    fps             | 4866  |
|    iterations      | 1     |
|    time_elapsed    | 2     |
|    total_timesteps | 12288 |
------------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 1120          |
|    iterations           | 2             |
|    time_elapsed         | 21            |
|    total_timesteps      | 24576         |
| train/                  |               |
|    approx_kl            | 0.00033332975 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.84         |
|    explained_variance   | -1.79e-06     |
|    learning_rate        | 0.0003        |
|    loss                 | 1.41e+09      |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.000471     |
|    

In [10]:
# evaluate the models
for model in model_list:
    if model != modelDDPG:
        mean_reward, std_reward = evaluate_policy(model, env_stable, n_eval_episodes=10, deterministic=False)
    else:
        mean_reward, std_reward = evaluate_policy(model, env_stable_dum, n_eval_episodes=10, deterministic=False)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:584624.27 +/- 1257474.12
mean_reward:123428.23 +/- 71274.48
mean_reward:4869658.39 +/- 0.00


In [11]:
# test run the model on env_stable
obs = env_stable.reset()


action, _states = modelPPO.predict(obs)
obs, rewards, done, info = env_stable.step(action)
statearray = env_stable.render(mode='None')


# check the shape of state
print("shape of state: ", statearray)
print("type of state: ", type(statearray))

print(rewards.shape)

print(action[0,:])



shape of state:  [array([ 2.78474998e+01,  2.78600006e+01,  2.68374996e+01,  2.73325005e+01,
       -4.04548945e-01, -2.09400796e-02,  0.00000000e+00,  2.00000000e+04,
        2.00000000e+04,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00]), array([ 2.78474998e+01,  2.78600006e+01,  2.68374996e+01,  2.73325005e+01,
       -4.04548945e-01, -2.09400796e-02,  0.00000000e+00,  2.00275950e+04,
        2.00000000e+04, -1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        2.75949993e+01]), array([ 2.78474998e+01,  2.78600006e+01,  2.68374996e+01,  2.73325005e+01,
       -4.04548945e-01, -2.09400796e-02,  0.00000000e+00,  1.81235400e+04,
        2.00000000e+04,  6.80000000e+01,  2.75949993e+01,  0.00000000e+00,
        0.00000000e+00]), array([ 2.78474998e+01,  2.78600006e+01,  2.68374996e+01,  2.73325005e+01,
       -4.04548945e-01, -2.09400796e-02,  0.00000000e+00,  2.00000000e+04,
        2.00000000e+04,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
    

In [11]:
# single process
# create a loop to sample state, action from the each models, reward and store in the dictionary
env_test_list = []
env_test_list.append(StockTradingEnv(stockdata_test, init_balance, len(stockdata_test)-1, random = False))
# split stockdata_test into 3 chunks and create an environment for each chunk
for i in range(3):
    splitdata = stockdata_test[int(len(stockdata_test)/3*(i)):int(len(stockdata_test)/3*(i+1))]
    env_test_list.append(StockTradingEnv(splitdata, init_balance, len(splitdata)-1, random = False))
name_output = []
output = []
num_episodes = 200
j = 0
for env_test in env_test_list:
    j += 1
    # loop through the models
    for model in model_list:
        data2 = {'data':[]}
        for i in range(num_episodes):
            # create list for storing state, action, reward
            dict = {'state': [], 'action': [], 'reward': [], 'timestep': []}
            # reset the environment
            env_test.reset()
            state = env_test.render(mode='None')
            dict['state'].append(state.tolist())
            timestep = 0
            done = False
            # create a loop to sample action, next_state, reward and store in the dictionary
            while not done:
                # sample action
                action, _states = model.predict(state, deterministic=False)
                try:
                    _, reward, done, info = env_test.step(action)
                except Exception as e:
                    print(e)
                    print('time step: ', timestep)
                next_state = env_test.render(mode='None')
                # store state, action, reward in the dictionary
                dict['action'].append(action.tolist())
                dict['reward'].append([reward])
                dict['timestep'].append(timestep)
                # update state
                timestep += 1
                state = next_state
                # check if done
                if done:
                    print('Episode: ', i, 'Timestep: ', timestep)
                    break
                else:
                    dict['state'].append(state.tolist())
            
            # store the state, action, reward list in the dictionary
            data2['data'].append(dict)
        output.append(data2)
        model_name = re.search('(?<=\.)\w+(?=\.)', str(model)).group(0)
        name_output.append(stock_name + '_test_len_' + str(timestep) + '_'+ str(j) + '_' + str(interval) + '_' + model_name + '_replaybuffer.json')

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Episode:  0 Timestep:  188
Episode:  1 Timestep:  12
Episode:  2 Timestep:  99
Episode:  3 Timestep:  4
Episode:  4 Timestep:  37
Episode:  5 Timestep:  71
Episode:  6 Timestep:  20
Episode:  7 Timestep:  172
Episode:  8 Timestep:  43
Episode:  9 Timestep:  33
Episode:  10 Timestep:  17
Episode:  11 Timestep:  15
Episode:  12 Timestep:  8
Episode:  13 Timestep:  165
Episode:  14 Timestep:  101
Episode:  15 Timestep:  34
Episode:  16 Timestep:  64
Episode:  17 Timestep:  30
Episode:  18 Timestep:  19
Episode:  19 Timestep:  25
Episode:  20 Timestep:  67
Episode:  21 Timestep:  134
Episode:  22 Timestep:  44
Episode:  23 Timestep:  94
Episode:  24 Timestep:  175
Episode:  25 Timestep:  75
Episode:  26 Timestep:  318
Episode:  27 Timestep:  32
Episode:  28 Timestep:  20
Episode:  29 Timestep:  20
Episode:  30 Timestep:  72
Episode:  31 Timestep:  69
Episode:  32 Timestep:  136
Episode:  33 Timestep:  51
Episode:  34 Timestep:  220
Episode:  35 Timestep:  70
Episode:  36 Timestep:  229
Epi

In [12]:
# loop through the output list and save each dictionary to a json file
for i in range(len(output)):
    file_name = name_output[i]
    # output the dictionary to a json file
    import json

    with open(file_name, 'w') as fp:
        json.dump(output[i], fp)
    