In [1]:
# This notebook creates and tests a method for curating stock-trading datasets for offline reinforcement learning with a Decision Transformer model.
# It first downloads stock data from Yahoo Finance.
# It then uses the stock data to create gym environments and samples states, actions and rewards (randomly and/or with a trained agent), which are stored as replay buffers.
# Finally, the replay buffers are grouped and exported as a dataset.

In [2]:
# import helper function for getting stock data
from getstock import get_stock_data_yf_between_with_indicators
# import time library
from datetime import datetime, timedelta
# --- Download configuration -------------------------------------------------
# Ticker symbol, bar interval and the indicator names handed to the helper.
stock_name = 'AAPL'
interval = '1d'
indicators = ['volume_obv', 'trend_macd', 'momentum_rsi']

# The window is described by a start date ('YYYY-MM-DD') plus a length in days.
start_date = '2016-01-01'
period = 365*6

# The end date is the start date shifted forward by `period` days,
# rendered back to the same 'YYYY-MM-DD' text format.
start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
end_date_obj = start_date_obj + timedelta(days=period)
end_date = f'{end_date_obj:%Y-%m-%d}'

# Fetch the stock data between the two dates with the requested indicators.
stockdata = get_stock_data_yf_between_with_indicators(
    stock_name,
    start_date,
    end_date,
    interval,
    indicators,
)

[*********************100%***********************]  1 of 1 completed


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [3]:
# create the gym environment using the stock data
import gym
from TradingEnvClass import StockTradingEnv

# Initial balance passed to the environment.
init_balance = 10_000
# One less than the number of rows in the downloaded stock data.
max_step = len(stockdata) - 1

# Build the trading environment over the full data set, with random=False.
env = StockTradingEnv(stockdata, init_balance, max_step, random=False)


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [4]:
# create dictionary with state, action, reward as keys and store the values in a list
# then create a huggingface dataset from the dictionary
# then save the huggingface dataset to a file
import numpy as np
from datasets import Dataset as huggingfaceDataset

# Replay-buffer container: per-episode trajectory dictionaries are appended to the 'data' list.
data = {"data": []}

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Roll out a uniformly random policy and store one trajectory per episode in `data`.
num_episodes = 1000
for i in range(num_episodes):
    # Per-episode trajectory. Named `episode` (not `dict`) so the builtin `dict`
    # is not shadowed for the rest of the kernel session.
    episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
    # reset the environment to get the first state
    state = env.reset()
    timestep = 0
    done = False
    while not done:
        # Record the state the action is taken from; the terminal next_state is
        # never recorded, so the four lists always have the same length.
        episode['state'].append(state.tolist())
        # sample a random action and advance the environment
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode['action'].append(action.tolist())
        # reward is wrapped in a one-element list, matching the existing buffer format
        episode['reward'].append([reward])
        episode['timestep'].append(timestep)
        # move on to the next step
        timestep += 1
        state = next_state

    print('Episode: ', i, 'Timestep: ', timestep)
    # store the finished trajectory in the replay buffer
    data['data'].append(episode)


Episode:  0 Timestep:  1509
Episode:  1 Timestep:  1509
Episode:  2 Timestep:  1509
Episode:  3 Timestep:  1509
Episode:  4 Timestep:  1509
Episode:  5 Timestep:  1509
Episode:  6 Timestep:  1509
Episode:  7 Timestep:  1509
Episode:  8 Timestep:  1509
Episode:  9 Timestep:  1509
Episode:  10 Timestep:  1509
Episode:  11 Timestep:  1509
Episode:  12 Timestep:  1509
Episode:  13 Timestep:  1509
Episode:  14 Timestep:  1509
Episode:  15 Timestep:  1509
Episode:  16 Timestep:  1509
Episode:  17 Timestep:  1509
Episode:  18 Timestep:  1509
Episode:  19 Timestep:  1509
Episode:  20 Timestep:  1509
Episode:  21 Timestep:  1509
Episode:  22 Timestep:  1509
Episode:  23 Timestep:  1509
Episode:  24 Timestep:  1509
Episode:  25 Timestep:  1509
Episode:  26 Timestep:  1509
Episode:  27 Timestep:  1509
Episode:  28 Timestep:  1509
Episode:  29 Timestep:  1509
Episode:  30 Timestep:  1509
Episode:  31 Timestep:  1509
Episode:  32 Timestep:  1509
Episode:  33 Timestep:  1509
Episode:  34 Timestep:  

  self.cost_basis = (prev_cost + additional_cost) / (self.shares_held + shares_bought)


Episode:  476 Timestep:  1509
Episode:  477 Timestep:  1509
Episode:  478 Timestep:  1509
Episode:  479 Timestep:  1509
Episode:  480 Timestep:  1509
Episode:  481 Timestep:  1509
Episode:  482 Timestep:  1509
Episode:  483 Timestep:  1509
Episode:  484 Timestep:  1509
Episode:  485 Timestep:  1509
Episode:  486 Timestep:  1509
Episode:  487 Timestep:  1509
Episode:  488 Timestep:  1509
Episode:  489 Timestep:  1509
Episode:  490 Timestep:  1509
Episode:  491 Timestep:  1509
Episode:  492 Timestep:  1509
Episode:  493 Timestep:  1509
Episode:  494 Timestep:  1509
Episode:  495 Timestep:  1509
Episode:  496 Timestep:  1509
Episode:  497 Timestep:  1509
Episode:  498 Timestep:  1509
Episode:  499 Timestep:  1509
Episode:  500 Timestep:  1509
Episode:  501 Timestep:  1509
Episode:  502 Timestep:  1509
Episode:  503 Timestep:  1509
Episode:  504 Timestep:  1509
Episode:  505 Timestep:  1509
Episode:  506 Timestep:  1509
Episode:  507 Timestep:  1509
Episode:  508 Timestep:  1509
Episode:  

In [7]:
import json

# File name is built from the ticker, window length, start date and bar interval.
file_name = f'{stock_name}_{period}_{start_date}_{interval}_random_replaybuffer.json'

# Write the collected replay-buffer dictionary to disk as JSON.
with open(file_name, 'w') as out_file:
    json.dump(data, out_file)

In [15]:
# train an agent using stable baselines
# import stable baselines
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Wrap a single trading environment, built on the first 400 rows of the stock
# data, in a DummyVecEnv so it can be used for stable baselines training.
# NOTE(review): max_step is still computed from the full data set while this
# environment only receives 400 rows — confirm the environment tolerates that.
def _make_training_env():
    return StockTradingEnv(stockdata.iloc[0:400], init_balance, max_step, random=False)


env_stable = DummyVecEnv([_make_training_env])

# create a custom policy network for the agent where the input is the state and the output is the action
class CustomPolicy(nn.Module):
    """
    Custom network for policy and value function.
    It receives as input the features extracted by the features extractor.

    :param features_dim: dimension of the features extracted with the features_extractor (e.g. features from a CNN)
    :param last_layer_dim_pi: (int) number of units for the last layer of the policy network
    :param last_layer_dim_vf: (int) number of units for the last layer of the value network
    see https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
    """

    def __init__(self, features_dim, last_layer_dim_pi, last_layer_dim_vf):
        super().__init__()
        # save the output dimensions
        self.last_layer_dim_pi = last_layer_dim_pi
        self.last_layer_dim_vf = last_layer_dim_vf
        # Same values under the attribute names used by the Stable-Baselines3
        # custom-network guide referenced above; the original names are kept
        # for backward compatibility.
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

        # policy network: a single linear layer followed by ReLU
        self.policy_net = nn.Sequential(
            nn.Linear(features_dim, last_layer_dim_pi), nn.ReLU()
        )

        # value network: a single linear layer followed by ReLU
        self.value_net = nn.Sequential(
            nn.Linear(features_dim, last_layer_dim_vf), nn.ReLU()
        )

    def forward(self, features):
        """
        Run both heads on the same features.

        :param features: tensor whose last dimension is ``features_dim``
        :return: tuple ``(pi, vf)`` with the policy-network and value-network outputs
        """
        return self.forward_actor(features), self.forward_critic(features)

    def forward_actor(self, features):
        """Return the policy-network output for ``features``."""
        return self.policy_net(features)

    def forward_critic(self, features):
        """Return the value-network output for ``features``."""
        return self.value_net(features)


# Policy architecture shared by all three algorithms. `policy` was previously
# never defined in this notebook, so this cell raised NameError on a fresh kernel.
# "MlpPolicy" is the Stable-Baselines3 built-in policy for vector observations.
policy = "MlpPolicy"

# create the models
modelPPO = PPO(policy, env_stable, verbose=1)
modelA2C = A2C(policy, env_stable, verbose=1)
modelDDPG = DDPG(policy, env_stable, verbose=1)
# keep the model objects in a list so later cells can loop over them
model_list = [modelPPO, modelA2C, modelDDPG]




Using cuda device
Using cuda device
Using cuda device


In [17]:
# Reset the environment and report the shape of the state it returns.
state = env.reset()
state_shape = state.shape
print('state shape: ', state_shape)

state shape:  (14,)


In [18]:
# Compare the shape of a randomly sampled action with the shape of an action predicted by the PPO model.
action = env.action_space.sample()
print('action shape: ', action.shape)

# NOTE(review): the recorded run of this cell raised a ValueError because the
# state has shape (14,) while the model expects (8,). Presumably the
# environment's declared observation space disagrees with what reset()
# returns — confirm in TradingEnvClass.
prediction = modelPPO.predict(state, deterministic=True)
model_action, _state = prediction
print('model action shape: ', model_action.shape)

action shape:  (2,)


ValueError: Error: Unexpected observation shape (14,) for Box environment, please use (8,) or (n_env, 8) for the observation shape.

In [16]:
# Train every model in turn, then report its reward over 10 evaluation episodes.
for model in model_list:
    print('Training model: ', model)
    model.learn(total_timesteps=10_000)
    print('Model trained')

    print('Evaluating model: ', model)
    evaluation = evaluate_policy(model, env_stable, n_eval_episodes=10)
    mean_reward, std_reward = evaluation
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Training model:  <stable_baselines3.ppo.ppo.PPO object at 0x7f80a05239a0>
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/victoru/anaconda3/envs/testpython/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_11180/1671012842.py", line 4, in <cell line: 2>
    model.learn(total_timesteps=10000)
  File "/home/victoru/anaconda3/envs/testpython/lib/python3.10/site-packages/stable_baselines3/ppo/ppo.py", line 317, in learn
  File "/home/victoru/anaconda3/envs/testpython/lib/python3.10/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 246, in learn
    while self.num_timesteps < total_timesteps:
  File "/home/victoru/anaconda3/envs/testpython/lib/python3.10/site-packages/stable_baselines3/common/base_class.py", line 489, in _setup_learn
  File "/home/victoru/anaconda3/envs/testpython/lib/python3.10/site-packages/stable_baselines3/common/vec_env/dummy_vec_env.py", line 64, in reset
    def seed(self, seed: Optional[int] = None) -> List

In [None]:
# Evaluate each model over 10 episodes on the vectorized environment and print mean and std reward.
for model in model_list:
    evaluation = evaluate_policy(model, env_stable, n_eval_episodes=10)
    mean_reward, std_reward = evaluation
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Roll out each model in the environment and collect one replay buffer per model.
# `output` ends up parallel to `model_list`: output[k] holds the episodes of model_list[k].
output = []
num_episodes = 500
# loop through the models
for model in model_list:
    # replay buffer for this model: per-episode trajectories under 'data'
    model_buffer = {'data': []}
    for i in range(num_episodes):
        # Per-episode trajectory. Named `episode` (not `dict`) so the builtin
        # `dict` is not shadowed for the rest of the kernel session.
        episode = {'state': [], 'action': [], 'reward': [], 'timestep': []}
        # reset the environment to get the first state
        state = env.reset()
        timestep = 0
        done = False
        while not done:
            # Record the state the action is taken from; the terminal next_state
            # is never recorded, so the four lists always have the same length.
            episode['state'].append(state.tolist())
            # let the model choose the action, then advance the environment
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            episode['action'].append(action.tolist())
            # reward is wrapped in a one-element list, matching the random-policy buffer format
            episode['reward'].append([reward])
            episode['timestep'].append(timestep)
            # move on to the next step
            timestep += 1
            state = next_state

        print('Episode: ', i, 'Timestep: ', timestep)
        # Store every finished episode. This append used to sit outside the
        # episode loop, so only the last episode of each model was kept.
        model_buffer['data'].append(episode)
    output.append(model_buffer)

In [None]:
import json

# Save each model's replay buffer to its own JSON file.
# `output` is parallel to `model_list`, so the two are walked together.
for model, model_buffer in zip(model_list, output):
    model_name = model.__class__.__name__
    # The f-string converts the integer `period` to text; the previous
    # `'_' + period` concatenation raised TypeError (str + int).
    file_name = f'{stock_name}_{period}_{start_date}_{interval}_{model_name}_replaybuffer.json'
    # output the dictionary to a json file
    with open(file_name, 'w') as fp:
        json.dump(model_buffer, fp)
    