In [1]:
# This notebook creates and tests a method for curating stock-trading datasets for offline reinforcement learning with a Decision Transformer model.
# It first downloads stock data from Yahoo Finance.
# It then uses the stock data to create gym environments and samples (state, action, reward) transitions, either randomly and/or with a trained agent, which are stored as a replay buffer.
# Finally, these replay buffers are grouped and exported as a dataset.

In [1]:
# Project helper that fetches stock data together with technical indicators
from getstock import get_stock_data_yf_between_with_indicators
# Date arithmetic used to derive the end of the download window
from datetime import datetime, timedelta

# Ticker symbol to fetch
stock_name = 'AAPL'

# Textual layout shared by start_date and end_date ('YYYY-MM-DD')
date_format = '%Y-%m-%d'

# Length of the download window, in days
period = 6 * 365
# First day of the window
start_date = '2016-01-01'
# Last day of the window: `period` days after the first day
start_date_obj = datetime.strptime(start_date, date_format)
end_date_obj = start_date_obj + timedelta(days=period)
end_date = end_date_obj.strftime(date_format)

# Bar interval and indicator names handed to the helper
interval = '1d'
indicators = ['volume_obv', 'trend_macd', 'momentum_rsi']

stockdata = get_stock_data_yf_between_with_indicators(
    stock_name,
    start_date,
    end_date,
    interval,
    indicators,
)

[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [6]:
# Preview the first 300 rows of the downloaded data
stockdata.iloc[:300]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,volume_obv,trend_macd,momentum_rsi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-12-31,26.752501,26.757500,26.205000,26.315001,163649200,163649200,0.000000,100.000000
2016-01-04,25.652500,26.342501,25.500000,26.337500,270597600,434246800,0.001795,100.000000
2016-01-05,26.437500,26.462500,25.602501,25.677500,223164000,211082800,-0.049469,3.068330
2016-01-06,25.139999,25.592501,24.967501,25.174999,273829600,-62746800,-0.129155,1.709592
2016-01-07,24.670000,25.032499,24.107500,24.112499,324377600,-387124400,-0.274873,0.851243
...,...,...,...,...,...,...,...,...
2017-03-06,34.842499,34.942501,34.650002,34.834999,87000000,3318554400,0.969750,80.861505
2017-03-07,34.764999,34.994999,34.697498,34.880001,69785200,3388339600,0.943343,81.196406
2017-03-08,34.737499,34.950001,34.705002,34.750000,74828800,3313510800,0.901533,77.004380
2017-03-09,34.685001,34.697498,34.262501,34.669998,88623600,3224887200,0.852120,74.456792


In [3]:
# Build the trading gym environment on top of the downloaded stock data
import gym
from TradingEnvClass import StockTradingEnv

# Initial balance passed to the environment
init_balance = 10000
# Step limit passed to the environment: one less than the number of data rows
max_step = len(stockdata) - 1

env = StockTradingEnv(
    stockdata,
    init_balance,
    max_step,
    random=False,
)


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [4]:
# Container for the sampled episodes.
# Each episode is later appended under the 'data' key as a dictionary of
# state / action / reward / timestep lists, so the whole structure can be
# turned into a huggingface dataset and written to a file.
import numpy as np
from datasets import Dataset as huggingfaceDataset

data = {'data': list()}

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Roll out episodes with uniformly sampled actions and record each episode
# as one trajectory dictionary appended to data['data'].
num_episodes = 1000
for i in range(num_episodes):
    # Per-episode trajectory. It is named `episode` rather than `dict` so the
    # builtin `dict` type is not shadowed for the rest of the kernel session.
    episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
    # reset the environment and record the initial observation
    state = env.reset()
    episode['state'].append(state.tolist())
    timestep = 0
    done = False
    # step until the environment reports the episode is finished
    while not done:
        # sample a random action from the action space
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        # record the transition taken from the current state
        episode['action'].append(action.tolist())
        episode['reward'].append([reward])
        episode['timestep'].append(timestep)
        # advance to the next state
        timestep += 1
        state = next_state
        # The observation returned together with done=True is deliberately
        # not recorded, so the four lists end up with the same length.
        if not done:
            episode['state'].append(state.tolist())

    print('Episode: ', i, 'Timestep: ', timestep)
    # store the finished trajectory
    data['data'].append(episode)


Episode:  0 Timestep:  1509
Episode:  1 Timestep:  1509
Episode:  2 Timestep:  1509
Episode:  3 Timestep:  1509
Episode:  4 Timestep:  1509
Episode:  5 Timestep:  1509
Episode:  6 Timestep:  1509
Episode:  7 Timestep:  1509
Episode:  8 Timestep:  1509
Episode:  9 Timestep:  1509
Episode:  10 Timestep:  1509
Episode:  11 Timestep:  1509
Episode:  12 Timestep:  1509
Episode:  13 Timestep:  1509
Episode:  14 Timestep:  1509
Episode:  15 Timestep:  1509
Episode:  16 Timestep:  1509
Episode:  17 Timestep:  1509
Episode:  18 Timestep:  1509
Episode:  19 Timestep:  1509
Episode:  20 Timestep:  1509
Episode:  21 Timestep:  1509
Episode:  22 Timestep:  1509
Episode:  23 Timestep:  1509
Episode:  24 Timestep:  1509
Episode:  25 Timestep:  1509
Episode:  26 Timestep:  1509
Episode:  27 Timestep:  1509
Episode:  28 Timestep:  1509
Episode:  29 Timestep:  1509
Episode:  30 Timestep:  1509
Episode:  31 Timestep:  1509
Episode:  32 Timestep:  1509
Episode:  33 Timestep:  1509
Episode:  34 Timestep:  

In [6]:
import json

# Build the output file name from the download parameters.
# `period` is an int, so it has to be converted explicitly: concatenating it
# to a str with `+` raises TypeError.
file_name = '_'.join([stock_name, str(period), start_date, interval, 'random_replaybuffer.json'])
# output the dictionary to a json file
with open(file_name, 'w') as fp:
    json.dump(data, fp)

In [None]:
# train agents using stable baselines3
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# The policy is selected with the "MlpPolicy" string alias instead of a class
# imported from stable_baselines3.common.policies: that module does not export
# `MlpPolicy`, and each algorithm resolves the alias to its own policy class
# (DDPG does not use the same policy class as PPO/A2C).

# create the environment as a vectorized environment for stable baselines training
# NOTE(review): `max_step` was derived from the full `stockdata`, but the
# environment here only receives the first 400 rows - confirm that
# StockTradingEnv tolerates a step limit larger than its data.
env_stable = DummyVecEnv([lambda: StockTradingEnv(stockdata.iloc[0:400], init_balance, max_step, random = True)])

# create the models
modelPPO = PPO("MlpPolicy", env_stable, verbose=1)
modelA2C = A2C("MlpPolicy", env_stable, verbose=1)
modelDDPG = DDPG("MlpPolicy", env_stable, verbose=1)

# train the models
modelPPO.learn(total_timesteps=10000)
modelA2C.learn(total_timesteps=10000)
modelDDPG.learn(total_timesteps=10000)

# collect the trained models in a list for the sampling cells below
model_list = [modelPPO, modelA2C, modelDDPG]

In [None]:
# Score every trained model on the vectorized environment
n_eval = 10
mean_reward, std_reward = evaluate_policy(modelPPO, env_stable, n_eval_episodes=n_eval)
mean_reward2, std_reward2 = evaluate_policy(modelA2C, env_stable, n_eval_episodes=n_eval)
mean_reward3, std_reward3 = evaluate_policy(modelDDPG, env_stable, n_eval_episodes=n_eval)

# One (label, mean, std) row per model, reported in evaluation order
summary_rows = (
    ('PPO mean reward: ', mean_reward, std_reward),
    ('A2C mean reward: ', mean_reward2, std_reward2),
    ('DDPG mean reward: ', mean_reward3, std_reward3),
)
for label, mean_value, std_value in summary_rows:
    print(label, mean_value, 'std reward: ', std_value)

In [None]:
# Roll out episodes with each trained model and collect one replay buffer
# ({'data': [episode, ...]}) per model, in the same order as model_list.
output = []
num_episodes = 500
# loop through the models
for model in model_list:
    data2 = {'data':[]}
    for i in range(num_episodes):
        # Per-episode trajectory. It is named `episode` rather than `dict` so
        # the builtin `dict` type is not shadowed in the kernel.
        episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
        # reset the environment and record the initial observation
        state = env.reset()
        episode['state'].append(state.tolist())
        timestep = 0
        done = False
        # step until the environment reports the episode is finished
        while not done:
            # ask the model for an action in the current state
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            # record the transition taken from the current state
            episode['action'].append(action.tolist())
            episode['reward'].append([reward])
            episode['timestep'].append(timestep)
            # advance to the next state
            timestep += 1
            state = next_state
            # The observation returned together with done=True is deliberately
            # not recorded, so the four lists end up with the same length.
            if not done:
                episode['state'].append(state.tolist())

        print('Episode: ', i, 'Timestep: ', timestep)
        # Store every finished episode. This append must stay inside the
        # episode loop; one level out, only the last episode of each model
        # would be kept.
        data2['data'].append(episode)
    output.append(data2)

In [None]:
# loop through the output list and save each dictionary to a json file
for i in range(len(output)):
    file_name = stock_name + '_' + period + '_' + start_date + '_' + interval + '_' + model_list[i].__class__.__name__ + '_replaybuffer.json'
    # output the dictionary to a json file
    import json
    with open(file_name, 'w') as fp:
        json.dump(output[i], fp)
    