# Toy version

In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces

In [4]:
# Read the minute-bar CSV and index the rows by their 'timestamp' column.
# Chaining set_index returns a new frame, so no in-place mutation is needed.
df = pd.read_csv('data/INTC_1Min_2023-08-01_2024-01-31.csv').set_index('timestamp')

In [5]:
class TradingEnv(gym.Env):
    """Toy single-asset trading environment driven by a price DataFrame.

    Every step advances one row in ``df``. The agent emits a single continuous
    value in [-1, 1] which is mapped to a long / short / hold decision, and the
    reward is the resulting position (+1, 0 or -1) multiplied by the change of
    the ``close`` column between the previous row and the current row.

    Parameters
    ----------
    df : pandas.DataFrame
        Market data. Every column becomes one observation feature, so all
        columns must be numeric; a ``close`` column is required for the reward.
    initial_balance : float, default 10000
        Balance restored at the start of every episode.
    action_threshold : float, default 0.1
        Dead-zone half-width: actions above ``+action_threshold`` go long,
        actions below ``-action_threshold`` go short, anything in between
        keeps the current position.
    """

    def __init__(self, df, initial_balance=10000, action_threshold=0.1):
        super(TradingEnv, self).__init__()

        self.df = df
        self.initial_balance = initial_balance
        self.action_threshold = action_threshold

        # Episode state; `reset` restores all of these.
        self.balance = initial_balance
        self.position = 0  # 0: no position, 1: long, -1: short
        self.current_step = 0
        self.total_reward = 0

        # One continuous action in [-1, 1]; `step` maps it to buy / sell / hold.
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        # One unbounded float32 feature per DataFrame column.
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(len(df.columns),), dtype=np.float32)

    def reset(self):
        """Start a new episode at the first row and return its observation."""
        self.balance = self.initial_balance
        self.position = 0
        self.current_step = 0
        # Without this the running total would leak from one episode into the next.
        self.total_reward = 0
        return self._next_observation()

    def _next_observation(self):
        """Return the row at ``current_step`` as a float32 vector.

        The cast keeps the observation consistent with the dtype declared in
        ``observation_space`` (pandas would otherwise hand back float64).
        """
        return np.asarray(self.df.iloc[self.current_step].values, dtype=np.float32)

    def step(self, action):
        """Advance one row, update the position and score the price move.

        Parameters
        ----------
        action : array-like or scalar
            The first element is the trading signal in [-1, 1].

        Returns
        -------
        obs : numpy.ndarray
            Observation for the new current row.
        reward : float
            ``position * (close[t] - close[t - 1])`` after applying the action.
        done : bool
            True once the last row of ``df`` has been reached.
        info : dict
            Always empty.
        """
        self.current_step += 1

        # Accept both the usual shape-(1,) array and a bare scalar.
        signal = float(np.asarray(action).reshape(-1)[0])

        if signal > self.action_threshold:  # buy
            self.position = 1  # long
        elif signal < -self.action_threshold:  # sell
            self.position = -1  # short
        # otherwise: hold, i.e. keep the current position unchanged

        close = self.df['close']
        price_change = close.iloc[self.current_step] - close.iloc[self.current_step - 1]
        reward = float(self.position * price_change)

        self.balance += reward
        self.total_reward += reward

        done = self.current_step >= len(self.df) - 1  # stop if we reach the end of the data
        obs = self._next_observation()

        return obs, reward, done, {}

    def render(self, mode='human'):
        """Print the current step, balance, position and cumulative reward."""
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Position: {self.position}')
        print(f'Total Reward: {self.total_reward}')

    def rend(self):
        """Backward-compatible alias for :meth:`render`."""
        self.render()


In [6]:
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.noise import NormalActionNoise

In [7]:
# Show the column names; TradingEnv uses every column as one observation feature.
print(df.columns)

Index(['open', 'high', 'low', 'close', 'volume', 'trade_count', 'vwap'], dtype='object')


In [8]:
# Keep the raw environment under its own name. The factory lambda resolves the name
# it closes over only when it is called, so that name must not be the one that gets
# rebound to the vectorised wrapper. `env` is still the DummyVecEnv used below.
trading_env = TradingEnv(df)
env = DummyVecEnv([lambda: trading_env])



In [9]:
# Size of the action vector, taken from the last axis of the action-space shape.
n_actions = env.action_space.shape[-1]

# Zero-mean Gaussian exploration noise with sigma 0.1 on every action component.
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

In [11]:
# Settings for this toy run, named so they are easy to find and tune.
TOTAL_TIMESTEPS = 10000
MODEL_PATH = "trading_model"

# Build the DDPG agent on the vectorised environment with the exploration noise above.
model = DDPG(
    policy="MlpPolicy",
    env=env,
    action_noise=action_noise,
    verbose=1,
    device='cuda',
)

# Train the model
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True)

# Save the model
model.save(MODEL_PATH)

Output()

Using cuda device


In [12]:
from stable_baselines3.common.evaluation import evaluate_policy

# NOTE(review): `env` wraps the same DataFrame the model was trained on, so this is an
# in-sample score; with a single evaluation episode the reported std is always 0.
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=1,
    render=False,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")



Mean reward: 6.93 +/- 0.00
