In [1]:
# This notebook creates and tests a method for curating stock-trading datasets for offline reinforcement learning with a Decision Transformer model.
# It first downloads stock data from Yahoo Finance.
# It then uses the stock data to create gym environments and samples (state, action, reward) trajectories, randomly and/or with a trained agent, which are stored as replay buffers.
# Finally, the replay buffers are grouped and exported as a dataset.

In [1]:
# import helper function for getting stock data
from getstock import get_stock_data_yf_between_with_indicators
# import time library
from datetime import datetime, timedelta
# --- Download configuration ---------------------------------------------------
stock_name = 'AAPL'
interval = '1d'
# indicator names forwarded to the download helper
indicators = ['volume_cmf', 'trend_macd', 'momentum_rsi']

# The download window starts at `start_date` ('YYYY-MM-DD') and spans `period` days.
start_date = '2016-01-01'
period = 6 * 365

# Derive the end of the window and format it the same way as `start_date`.
date_format = '%Y-%m-%d'
start_date_obj = datetime.strptime(start_date, date_format)
end_date_obj = start_date_obj + timedelta(days=period)
end_date = end_date_obj.strftime(date_format)

# --- Fetch the stock data together with the requested indicators --------------
stockdata = get_stock_data_yf_between_with_indicators(
    stock_name,
    start_date,
    end_date,
    interval,
    indicators,
)

[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [2]:
# create the gym environment using the stock data
import gym
from TradingEnvClass import StockTradingEnv

# Settings handed to the environment: the starting balance and a step limit
# of one less than the number of rows in the downloaded stock data.
init_balance = 10000
max_step = len(stockdata) - 1

env = StockTradingEnv(stockdata, init_balance, max_step, random=False)

# Reset once and report the length of the observation vector.
state = env.reset()
observation_size = state.shape[0]
print('state shape: ', observation_size)

# Draw one random action from the action space and report its shape.
action = env.action_space.sample()
action_shape = action.shape
print('action shape: ', action_shape)


state shape:  13
action shape:  (2,)


  logger.warn(


In [9]:
# Container for the replay buffer: data['data'] is a list that receives one
# entry per episode from the sampling loop that follows.
# The buffer is written to a JSON file further down. huggingfaceDataset is
# imported here but not used in the cells shown.
# TODO confirm whether a Hugging Face dataset export is still planned.
import numpy as np
from datasets import Dataset as huggingfaceDataset

data = {'data':[]}

In [10]:
# Collect random-policy rollouts. Every episode becomes one dict of parallel
# per-step lists ('state', 'action', 'reward', 'timestep') appended to data['data'].
num_episodes = 1000
for i in range(num_episodes):
    # Named `episode` rather than `dict` so the builtin dict type is not
    # shadowed for the rest of the kernel session.
    episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
    # reset the environment
    state = env.reset()
    timestep = 0
    done = False
    while not done:
        # sample a random action and advance the environment by one step
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        # Abort the episode on an infinite observation and dump the offending
        # transition. Nothing from this step is recorded, so the stored lists
        # stay aligned (previously a trailing state without an action was kept).
        if np.isinf(next_state).any():
            print('infinite next_state')
            print('time: ', timestep)
            print('state: ', state)
            print('next_state: ', next_state)
            print('action: ', action)
            print('reward: ', reward)
            print('done: ', done)
            print('info: ', info)

            break
        # Record the transition: the state the action was taken in, the action,
        # the reward it produced and the step index. The terminal observation
        # is not stored, matching the one-state-per-action layout.
        episode['state'].append(state.tolist())
        episode['action'].append(action.tolist())
        episode['reward'].append([reward])
        episode['timestep'].append(timestep)
        # update state
        timestep += 1
        state = next_state
        if done:
            print('Episode: ', i, 'Timestep: ', timestep)

    # store the finished (or aborted) episode in the replay buffer
    data['data'].append(episode)


Episode:  0 Timestep:  1509
Episode:  1 Timestep:  1509
Episode:  2 Timestep:  1509
Episode:  3 Timestep:  1509
Episode:  4 Timestep:  1509
Episode:  5 Timestep:  1509
Episode:  6 Timestep:  1509
Episode:  7 Timestep:  1509
Episode:  8 Timestep:  1509
Episode:  9 Timestep:  1509
Episode:  10 Timestep:  1509
Episode:  11 Timestep:  1509
Episode:  12 Timestep:  1509
Episode:  13 Timestep:  1509
Episode:  14 Timestep:  1509
Episode:  15 Timestep:  1509
Episode:  16 Timestep:  1509
Episode:  17 Timestep:  1509
Episode:  18 Timestep:  1509
Episode:  19 Timestep:  1509
Episode:  20 Timestep:  1509
Episode:  21 Timestep:  1509
Episode:  22 Timestep:  1509
Episode:  23 Timestep:  1509
Episode:  24 Timestep:  1509
Episode:  25 Timestep:  1509
Episode:  26 Timestep:  1509
Episode:  27 Timestep:  1509
Episode:  28 Timestep:  1509
Episode:  29 Timestep:  1509
Episode:  30 Timestep:  1509
Episode:  31 Timestep:  1509
Episode:  32 Timestep:  1509
Episode:  33 Timestep:  1509
Episode:  34 Timestep:  

In [7]:
import json

# File name pattern: <stock>_<period>_<start date>_<interval>_random_replaybuffer.json
file_name = f"{stock_name}_{period}_{start_date}_{interval}_random_replaybuffer.json"

# Serialize the whole replay-buffer dictionary to that file as JSON.
with open(file_name, 'w') as out_file:
    json.dump(data, out_file)

In [3]:
# Set up PyTorch and Stable-Baselines3 for training agents on the trading environment.
import torch
# Debugging aid: enable autograd anomaly detection to get more information on errors.
# NOTE(review): the PyTorch docs warn that this mode slows execution — consider
# disabling it once the inf/NaN issue is understood.
torch.autograd.set_detect_anomaly(True)
import torch.nn as nn

# RL algorithms used in the cells that follow
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DDPG

# vectorized-environment wrappers and the policy evaluation helper
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy

# Build the training environment for Stable-Baselines3, innermost wrapper first:
#   DummyVecEnv  - vectorized interface around the single `env` instance
#   VecNormalize - normalizes observations and rewards, clipping observations at 10
#   VecCheckNan  - raises an exception when it finds NaN or inf
env_stable = VecCheckNan(
    VecNormalize(
        DummyVecEnv([lambda: env]),
        norm_obs=True,
        norm_reward=True,
        clip_obs=10.,
    ),
    raise_exception=True,
)


In [4]:
# Instantiate one agent per algorithm, all with the "MlpPolicy" policy on the
# wrapped environment, in the order PPO, A2C, DDPG.
algorithms = (PPO, A2C, DDPG)
model_list = [algo("MlpPolicy", env_stable, verbose=1) for algo in algorithms]

# Keep the individual model objects reachable by name as well; model_list holds
# the same objects (not their names).
modelPPO, modelA2C, modelDDPG = model_list

Using cuda device
Using cuda device
Using cuda device


In [4]:
# Reset the raw (unwrapped) environment and display the shape and values of
# its initial observation.
state = env.reset()
print('state shape: ', state.shape)
print(state)

state shape:  (13,)
[ 2.67525005e+01  2.67574997e+01  2.62049999e+01  2.63150005e+01
 -6.01807594e-01  0.00000000e+00  1.00000000e+02  1.00000000e+04
  1.00000000e+04  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00]


In [5]:
# Reset the wrapped environment (env_stable) and display the shape and values
# of the observation it returns. Note that this overwrites `state`.
state = env_stable.reset()
print('state shape: ', state.shape)
print(state)

# TODO: find out why using the DummyVecEnv-wrapped environment results in inf in the state

state shape:  (1, 13)
[[ 2.675e+01  2.675e+01  2.620e+01  2.631e+01 -6.021e-01  0.000e+00
   1.000e+02  1.000e+04  1.000e+04  0.000e+00  0.000e+00  0.000e+00
   0.000e+00]]


In [18]:
# Compare the shape of a random action from the raw environment's action space
# with the shape of an action predicted by the PPO model.
action = env.action_space.sample()
print('action shape: ', action.shape)
# `state` is whatever an earlier cell left in the kernel — presumably the
# observation returned by env_stable.reset(); verify the cell order before running.
model_action, _state = modelPPO.predict(state, deterministic=True)
print('model action shape: ', model_action.shape)

action shape:  (2,)
model action shape:  (1, 2)


In [6]:
# Step the wrapped environment once with the PPO action and show the result.
# Requires `model_action` from the cell that calls modelPPO.predict; running
# this cell first raises NameError.
print(model_action)
next_state, reward, done, info = env_stable.step(model_action)
# Print the shape under the "shape" label, then the values, as the earlier
# state-inspection cells do (previously the whole array was printed here).
print('next state shape: ', next_state.shape)
print(next_state)
print('reward: ', reward)
print('done: ', done)
print('info: ', info)

NameError: name 'model_action' is not defined

In [20]:
# Carry the latest observation forward as the current state. `next_state` is
# presumably the result of the env_stable.step cell — verify the cell order.
state = next_state

In [7]:
# Train every agent in model_list and evaluate it right after training.
# NOTE(review): evaluation runs on env_stable, which was built with
# norm_reward=True — confirm whether the reported rewards should be the
# normalized ones.
for model in model_list:
    # training phase
    print('Training model: ', model)
    model.learn(total_timesteps=1)
    print('Model trained')

    # evaluation phase: 10 episodes on the wrapped environment
    print('Evaluating model: ', model)
    eval_result = evaluate_policy(model, env_stable, n_eval_episodes=10)
    mean_reward, std_reward = eval_result
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Training model:  <stable_baselines3.ppo.ppo.PPO object at 0x7fd54b283ca0>


ValueError: found inf in observations.
Originated from the environment, Last given value was: 
	action=[[1.653614 1.      ]]

In [None]:
# evaluate the models
# Report the mean and standard deviation of the reward over 10 evaluation
# episodes for each agent in model_list.
for model in model_list:
    eval_result = evaluate_policy(model, env_stable, n_eval_episodes=10)
    mean_reward, std_reward = eval_result
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Roll out every model in the raw environment and collect one replay buffer per
# model. output[k] holds the buffer gathered with model_list[k].
output = []
num_episodes = 500
# loop through the models
for model in model_list:
    model_buffer = {'data': []}
    for i in range(num_episodes):
        # Parallel per-step lists for this episode. Named `episode` rather than
        # `dict` so the builtin dict type is not shadowed in the kernel.
        episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
        # reset the environment
        state = env.reset()
        timestep = 0
        done = False
        while not done:
            # NOTE(review): the models are built on env_stable (VecNormalize) but
            # are fed raw observations from `env` here — confirm whether the
            # observations should be normalized before calling predict.
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            # Record the transition: the state the action was taken in, the
            # action, the reward and the step index. The terminal observation
            # is not stored, so all four lists have the same length.
            episode['state'].append(state.tolist())
            episode['action'].append(action.tolist())
            episode['reward'].append([reward])
            episode['timestep'].append(timestep)
            # update state
            timestep += 1
            state = next_state
        print('Episode: ', i, 'Timestep: ', timestep)

        # Store the episode inside the episode loop. This append used to sit one
        # level out, so only the last episode of each model was kept.
        model_buffer['data'].append(episode)
    output.append(model_buffer)

In [None]:
import json

# Save each model's replay buffer to its own JSON file. Buffers and models are
# paired by position: output[k] belongs to model_list[k].
for model, buffer in zip(model_list, output):
    # Build the name with an f-string: `period` is an int, so concatenating it
    # with '+' raised TypeError.
    file_name = f"{stock_name}_{period}_{start_date}_{interval}_{model.__class__.__name__}_replaybuffer.json"
    # output the dictionary to a json file
    with open(file_name, 'w') as fp:
        json.dump(buffer, fp)
    