 Install the Stable Baselines3 library along with additional optional dependencies, referred to as "extra" dependencies. Stable Baselines3 is a popular library for reinforcement learning (RL) in Python. It provides various RL algorithms and utilities to facilitate the implementation and evaluation of RL agents.

In [None]:
pip install stable-baselines3[extra]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.6.2-py3-none-any.whl (170 kB)
[K     |████████████████████████████████| 170 kB 28.7 MB/s 
[?25hCollecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 27.2 MB/s 
Collecting importlib-metadata~=4.13
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting rich
  Downloading rich-12.6.0-py3-none-any.whl (237 kB)
[K     |████████████████████████████████| 237 kB 64.4 MB/s 
[?25hCollecting tensorboard>=2.9.1
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 48.6 MB/s 
Collecting ale-py==0.7.4
  Downloading ale_py-0.7.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 48.2 MB/s 
[?25hCollecting autorom[accept-rom-license]~

In [None]:
import gym
import json
import datetime as dt

from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

# 1 - The first environment with discrete actions
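This code implements a stock trading environment in which, at every step, the agent buys, sells, or holds shares. The action is a single number in [-1, 1]: a positive value is a buy order, a negative value is a sell order, and exactly 0 is a hold, so the decision is effectively discrete even though the action space is declared as a continuous `Box`. The observations are historical stock data (`Close`, `rsi`, `macd`, `atr`, `osc`) for a given number of days (the horizon, 10 here). The agent can only trade whole shares, and the reward is the current account value (cash plus the value of the held shares) minus the opening account balance. The environment includes methods for taking actions, getting observations, and resetting the state.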

In [None]:
import random
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np

MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_OPEN_POSITIONS = 5
MAX_STEPS = 20000

INITIAL_ACCOUNT_BALANCE = 10000


class StockTradingEnv(gym.Env):
    """
    A stock trading environment for OpenAI gym.

    The agent submits one number in [-1, 1] per step: a positive value is a
    buy order, a negative value is a sell order and exactly 0 is a hold.
    Orders are filled in whole shares at the current step's "Close" price and
    move at most ``MAX_SHARES_PER_ORDER`` shares.

    Attributes:
        df (pandas.DataFrame): Historical stock data used as the trading environment.
        reward (float): Current reward (profit or loss) obtained by the agent.
        current_step (int): Current step number in the trading environment.
        opening_account_balance (float): Initial account balance for the trading agent.
        horizon (int): Number of days to use as an observation window for the agent.
        action_space (gym.spaces.Box): Action space representing the buy/sell actions.
        observation_features (list): List of feature names used for observation.
        observation_space (gym.spaces.Box): Observation space for the agent.
        cash_balance (float): Cash currently available (set by ``reset``).
        account_value (float): Cash plus value of held shares (set by ``reset``).
        num_shares_held (int): Number of shares currently held (set by ``reset``).
        cost_basis (float): Average purchase price of the held shares (set by ``reset``).
        trades (list): Set to an empty list by ``reset``; not otherwise used here.

    """
    metadata = {'render.modes': ['human']}

    # Largest number of shares a single buy or sell order may move.
    MAX_SHARES_PER_ORDER = 10

    def __init__(self, df):
        """
        Initializes the StockTradingEnv.

        Args:
            df (pandas.DataFrame): Historical stock data used as the trading environment.
                Must contain the columns listed in ``observation_features`` plus "Open".

        """
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.reward = 0
        self.current_step = 10
        self.opening_account_balance = INITIAL_ACCOUNT_BALANCE
        self.horizon = 10  # Number of days for observation window

        # One continuous value: > 0 buys, < 0 sells, == 0 holds.
        self.action_space = spaces.Box(low=np.array([-1]), high=np.array([1]), dtype=np.float32)

        self.observation_features = [
            "Close",
            'rsi',
            'macd',
            'atr',
            'osc',
        ]

        # NOTE: get_observation() returns the raw column values; nothing in
        # this class scales them into the [0, 1] bounds declared here.
        # np.float64 replaces the deprecated np.float alias (same dtype).
        self.observation_space = spaces.Box(
            low=0,
            high=1,
            shape=(len(self.observation_features), self.horizon),
            dtype=np.float64,
        )

    def get_observation(self):
        """
        Retrieves the observation for the current step.

        Returns:
            np.ndarray: Array of shape (number of features, horizon) holding the
            ``horizon`` rows that precede ``current_step``.

        """
        # .loc slices by label and includes both endpoints, so this selects
        # exactly `horizon` rows. Assumes df has an integer index 0..n-1.
        window = self.df.loc[
            self.current_step - self.horizon: self.current_step - 1,
            self.observation_features,
        ]
        return window.to_numpy().T

    def _take_action(self, action):
        """
        Executes the given action.

        Args:
            action (float): Action taken by the agent; > 0 buys, < 0 sells,
                0 holds.

        """
        if action == 0:  # Indicates "Hold" action
            # Hold position; No trade to be executed
            return

        order_type = "buy" if action <= 1 and action > 0 else "sell"
        current_price = self.df.loc[self.current_step, "Close"]

        if order_type == "buy":
            # Buy as many whole shares as cash allows, up to the per-order cap.
            num_shares_bought = min(
                int(self.cash_balance / current_price), self.MAX_SHARES_PER_ORDER
            )

            # Skip the bookkeeping when nothing is bought; otherwise the
            # average-cost update would divide by zero shares.
            if num_shares_bought > 0:
                current_cost = self.cost_basis * self.num_shares_held
                additional_cost = num_shares_bought * current_price

                self.cash_balance -= additional_cost
                self.cost_basis = (current_cost + additional_cost) / (
                    self.num_shares_held + num_shares_bought
                )
                self.num_shares_held += num_shares_bought

        elif order_type == "sell":
            # Simulate a SELL order and execute it at current_price.
            # The cap was previously written as a comparison (`== 10`), which
            # had no effect and let the whole position be sold at once.
            num_shares_sold = min(self.num_shares_held, self.MAX_SHARES_PER_ORDER)

            self.cash_balance += num_shares_sold * current_price
            self.num_shares_held -= num_shares_sold

        if self.num_shares_held == 0:
            self.cost_basis = 0

        # Update account value
        self.account_value = self.cash_balance + self.num_shares_held * current_price

    def step(self, action):
        """
        Executes one step within the trading environment.

        Args:
            action (float): Action taken by the agent.

        Returns:
            np.ndarray: Observation data for the current step.
            float: Reward (profit or loss) obtained by the agent.
            bool: True if the episode is done, False otherwise.
            dict: Additional information (not used in this implementation).

        """
        # Execute the given action
        self._take_action(action)

        self.current_step += 1

        # Calculate reward (profit or loss) relative to the opening balance
        self.reward = self.account_value - self.opening_account_balance
        # The episode ends once current_step * horizon reaches the row count.
        done = self.current_step * self.horizon >= len(self.df.loc[:, "Open"].values)

        obs = self.get_observation()

        return obs, self.reward, done, {}

    def reset(self):
        """
        Resets the state of the environment to an initial state.

        Returns:
            np.ndarray: Observation data for the current state.

        """
        self.cash_balance = self.opening_account_balance
        self.account_value = self.opening_account_balance
        self.num_shares_held = 0
        self.cost_basis = 0
        self.current_step = 12
        self.trades = []

        return self.get_observation()

    def render(self, mode='human', close=False):
        """
        Displays the current state of the environment.

        Args:
            mode (str): Render mode (not used in this implementation).
            close (bool): True if rendering is to be closed (not used in this implementation).

        """
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.cash_balance}')
        print(f'Shares held: {self.num_shares_held}')
        print(f'Avg cost for held shares: {self.cost_basis}')
        print(f'Reward: {self.reward}')




**Test**

In [None]:
# Load the price/indicator table consumed by the trading environments.
# NOTE(review): the rows displayed below appear to be ordered newest-first
# (20191113 at index 0); confirm that is the order the environment should walk.
df = pd.read_csv('AAPL.csv')


In [None]:

df

Unnamed: 0,Date,Open,High,Low,Close,Volume,rsi,macd,atr,osc
0,20191113,65.28,66.19,65.27,66.12,1.032700e+08,0.000000,0.000000,0.000000,0.000000
1,20191112,65.39,65.70,65.23,65.49,8.739000e+07,0.000000,0.000000,0.000000,0.000000
2,20191111,64.58,65.62,64.57,65.55,8.203000e+07,0.000000,0.000000,0.000000,0.000000
3,20191108,64.67,65.11,64.21,65.03,7.008000e+07,0.000000,0.000000,0.000000,0.000000
4,20191107,64.69,65.09,64.53,64.86,9.494000e+07,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
2732,20090108,3.23,3.33,3.22,3.31,6.734600e+08,-7.992641,0.059467,0.118571,0.866667
2733,20090107,3.28,3.30,3.22,3.25,7.572000e+08,-21.860536,0.047635,0.115714,0.851852
2734,20090106,3.43,3.47,3.30,3.32,1.290000e+09,-80.532402,0.032239,0.122857,0.779412
2735,20090105,3.33,3.43,3.31,3.38,1.190000e+09,-38.286621,0.015022,0.124286,0.867647


In [None]:
import gym
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy



# Step 1: Create the StockTradingEnv environment
env = StockTradingEnv(df)

# Step 2: Create a Proximal Policy Optimization (PPO) agent using the ActorCriticPolicy
model = PPO(ActorCriticPolicy, env, verbose=1)

# Step 3: Train the agent on the environment for a specified number of timesteps (iterations)
model.learn(total_timesteps=20000)

# Step 4: Test the trained agent in the environment for 1000 steps
obs = env.reset()
for i in range(1000):
    # Step 4a: The agent predicts an action based on the current observation
    action, _states = model.predict(obs)
    print('action:', action)

    # Step 4b: Take the predicted action in the environment and obtain new observations, rewards, and other information
    obs, rewards, done, info = env.step(action)

    # Step 4c: Render the current state of the environment (optional)
    env.render()

    # Step 4d: Start a fresh episode once the environment reports it is done,
    # instead of continuing to step an already finished episode
    if done:
        obs = env.reset()




  -self.np_random.exponential(size=upp_bounded[upp_bounded].shape)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def reset(self, **kwargs):


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Avg cost for held shares: 0
reward: -317.2000000000062
action: [-1.]
reward: -317.2000000000062
Step: 299
Balance: 9682.799999999994
Shares held: 0
Avg cost for held shares: 0
reward: -317.2000000000062
action: [-1.]
reward: -317.2000000000062
Step: 300
Balance: 9682.799999999994
Shares held: 0
Avg cost for held shares: 0
reward: -317.2000000000062
action: [-1.]
reward: -317.2000000000062
Step: 301
Balance: 9682.799999999994
Shares held: 0
Avg cost for held shares: 0
reward: -317.2000000000062
action: [0.5540471]
reward: -317.2000000000062
Step: 302
Balance: 9111.899999999994
Shares held: 10
Avg cost for held shares: 57.09000000000001
reward: -317.2000000000062
action: [1.]
reward: -319.0000000000073
Step: 303
Balance: 8542.799999999994
Shares held: 20
Avg cost for held shares: 57.0
reward: -319.0000000000073
action: [0.7623972]
reward: -332.0000000000073
Step: 304
Balance: 7980.199999999993
Shares held: 30
Avg cost for h

In [None]:
# Smoke test: drive the environment with randomly sampled actions and print
# the resulting state after every step.
if __name__ == "__main__":
    env = StockTradingEnv(df)
    obs = env.reset()
    num_episodes = 40  # Number of steps to run (not episodes); increase for a longer run
    for _ in range(num_episodes):
        # Draw a random action from the environment's action space
        action = env.action_space.sample()
        print(action)
        # NOTE(review): `done` is not checked, so the loop keeps stepping even
        # if the environment reports the episode as finished.
        next_obs, reward, done, _ = env.step(action)
        env.render()

[0]
Step: 11
Balance: 10000
Shares held: 0
Avg cost for held shares: 0
reward: 0
[0]
Step: 12
Balance: 10000
Shares held: 0
Avg cost for held shares: 0
reward: 0
[-1]
Step: 13
Balance: 10000.0
Shares held: 0
Avg cost for held shares: 0
reward: 0
[0]
Step: 14
Balance: 10000.0
Shares held: 0
Avg cost for held shares: 0
reward: 0
[-1]
Step: 15
Balance: 10000.0
Shares held: 0
Avg cost for held shares: 0
reward: 0
[-1]
Step: 16
Balance: 10000.0
Shares held: 0
Avg cost for held shares: 0
reward: 0
[1]
Step: 17
Balance: 9400.1
Shares held: 10
Avg cost for held shares: 59.989999999999995
reward: 0
[1]
Step: 18
Balance: 8798.800000000001
Shares held: 20
Avg cost for held shares: 60.06
reward: 0
[-1]
Step: 19
Balance: 9980.800000000001
Shares held: 0
Avg cost for held shares: 0
reward: 0
[-1]
Step: 20
Balance: 9980.800000000001
Shares held: 0
Avg cost for held shares: 0
reward: 0
[0]
Step: 21
Balance: 9980.800000000001
Shares held: 0
Avg cost for held shares: 0
reward: 0
[-1]
Step: 22
Balance: 9

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


# 2 - The second environment with continuous actions
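This code also implements a stock trading environment, but with some differences from the first snippet. Here, the agent's action has two continuous components: an action type in [0, 3] (below 1 means buy, from 1 up to but not including 2 means sell, anything else means hold) and an amount in [0, 1], i.e. the percentage (0% to 100%) of the affordable shares to buy or of the held shares to sell. The observation contains the Open, High, Low, Close and Volume values for a six-step window starting at the current step, plus a row of account features (balance, maximum net worth, number of shares held, cost basis, total shares sold, total sales value). Share counts are truncated to whole numbers, so the agent does not trade fractional shares, and the reward is calculated as the balance multiplied by a delay modifier. The environment also includes methods for taking actions, getting observations, and resetting the state.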

In [None]:
import random
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np

MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_OPEN_POSITIONS = 5
MAX_STEPS = 20000

INITIAL_ACCOUNT_BALANCE = 10000


class StockTradingEnv(gym.Env):
    """
    A stock trading environment for OpenAI gym.

    Each action is a pair ``(action_type, amount)``: ``action_type`` below 1
    buys, from 1 up to (not including) 2 sells, anything else holds;
    ``amount`` is the fraction of the affordable shares to buy or of the held
    shares to sell. Share counts are truncated to whole numbers.

    Attributes:
        df (pandas.DataFrame): Historical stock data used as the trading environment.
        action_space (gym.spaces.Box): Action space representing the buy/sell actions.
        observation_space (gym.spaces.Box): Observation space for the agent.
        balance (float): Current account balance.
        net_worth (float): Net worth of the account (balance + value of held shares).
        max_net_worth (float): Maximum net worth achieved so far.
        shares_held (int): Number of shares currently held.
        cost_basis (float): Average cost basis of the held shares.
        total_shares_sold (int): Total number of shares sold during trading.
        total_sales_value (float): Total value obtained from selling shares.
        current_step (int): Current step number in the trading environment.
        reward_range (tuple): Tuple representing the range of possible rewards.

    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        """
        Initializes the StockTradingEnv.

        Args:
            df (pandas.DataFrame): Historical stock data used as the trading environment.
                Must contain "Open", "High", "Low", "Close" and "Volume" columns.

        """
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)

        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(
            low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)

        # Five rows of scaled Open/High/Low/Close/Volume values over a
        # six-step window, plus one row of six account features
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(6, 6))

        # Initialize environment-specific variables
        self._reset_account_state()

    def _reset_account_state(self):
        """Restores the opening account values and picks a random start step."""
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Leave room for the six-row observation window after the start step
        self.current_step = random.randint(
            0, len(self.df.loc[:, 'Open'].values) - 6)

    def _next_observation(self):
        """
        Generates the next observation for the agent.

        Returns:
            np.ndarray: Array of shape (6, 6): five rows of scaled price/volume
            data followed by one row of scaled account features.

        """
        # .loc slices by label and includes both endpoints, so this window
        # holds six rows: current_step .. current_step + 5.
        # Assumes df has an integer index 0..n-1.
        window = self.df.loc[self.current_step: self.current_step + 5]
        frame = np.array([
            window['Open'].values / MAX_SHARE_PRICE,
            window['High'].values / MAX_SHARE_PRICE,
            window['Low'].values / MAX_SHARE_PRICE,
            window['Close'].values / MAX_SHARE_PRICE,
            window['Volume'].values / MAX_NUM_SHARES,
        ])

        # Append the account features, each divided by its assumed maximum
        obs = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]], axis=0)

        return obs

    def _take_action(self, action):
        """
        Executes the given action.

        Args:
            action (np.ndarray): Action taken by the agent as
                ``(action_type, amount)``.

        """
        # Set the current price to a random price within the time step
        current_price = random.uniform(
            self.df.loc[self.current_step, "Open"], self.df.loc[self.current_step, "Close"])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * amount)

            # Skip the bookkeeping when nothing is bought; otherwise the
            # average-cost update would divide by zero shares.
            if shares_bought > 0:
                prev_cost = self.cost_basis * self.shares_held
                additional_cost = shares_bought * current_price

                self.balance -= additional_cost
                self.cost_basis = (
                    prev_cost + additional_cost) / (self.shares_held + shares_bought)
                self.shares_held += shares_bought

        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = int(self.shares_held * amount)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        """
        Executes one time step within the environment.

        Args:
            action (np.ndarray): Action taken by the agent.

        Returns:
            np.ndarray: The next observation for the agent.
            float: The reward obtained from the action.
            bool: A flag indicating if the episode is done.
            dict: Additional information (empty in this case).

        """
        # Execute one time step within the environment
        self._take_action(action)

        self.current_step += 1

        # Wrap around to the first row once the observation window would run
        # past the end of the data
        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0

        # The reward is the cash balance weighted by the step index
        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        """
        Resets the state of the environment to an initial state.

        Returns:
            np.ndarray: The initial observation for the agent.

        """
        # Reset the account and set the current step to a random point
        # within the data frame
        self._reset_account_state()

        return self._next_observation()

    def render(self, mode='human', close=False):
        """
        Renders the environment to the screen.

        Args:
            mode (str): Rendering mode (not used in this implementation).
            close (bool): Close the rendering (not used in this implementation).

        """
        # Render the environment to the screen
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(f'Avg cost for held shares: {self.cost_basis} '
              f'(Total sales value: {self.total_sales_value})')
        print(f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')





# 3 - The third environment with continuous actions and different observation features
This code implements another continuous stock trading environment with different observation features. It uses the Close price together with its simple moving averages (SMA) and exponential moving averages (EMA) over 10, 21 and 50 steps as features. The agent takes continuous actions ranging from -1 (sell) to 1 (buy). Orders are filled in whole shares, with a maximum of 10 shares per transaction. The reward is calculated as the difference between the current account value and the opening account balance. The environment includes methods for taking actions, getting observations, and resetting the state.

In [None]:
# Default configuration for the continuous trading environment.
env_config = dict(
    ticker="TSLA",
    opening_account_balance=10000,
    # Number of steps (days) of data provided to the agent in one observation
    observation_horizon_sequence_length=10,
)

class StockTradingContinuousEnv(gym.Env):
    """
    A stock trading environment with a single continuous action in [-1, 1].

    A positive action is a buy order, a negative action is a sell order and
    exactly 0 is a hold. Orders are filled in whole shares at the current
    step's "Close" price and move at most ``MAX_SHARES_PER_ORDER`` shares.
    Observations are the Close price and its 10/21/50-step simple and
    exponential moving averages over the last ``horizon`` steps.
    """

    # Largest number of shares a single buy or sell order may move.
    MAX_SHARES_PER_ORDER = 10

    def __init__(self, env_config: dict = env_config):
        """
        Initializes the StockTradingContinuousEnv.

        Args:
            env_config (dict): A dictionary containing environment configuration
                settings: "ticker" (defaults to "MSFT"),
                "opening_account_balance" (required) and
                "observation_horizon_sequence_length".

        Raises:
            AssertionError: If ``data/<ticker>.csv`` does not exist.

        """
        # Imported here so the class does not rely on a module-level
        # `import os`, which this file does not provide.
        import os

        super(StockTradingContinuousEnv, self).__init__()
        self.ticker = env_config.get("ticker", "MSFT")
        self.current_step = 10

        # `__file__` is undefined in an interactive session or notebook;
        # fall back to the working directory there.
        try:
            base_dir = os.path.dirname(os.path.realpath(__file__))
        except NameError:
            base_dir = os.getcwd()
        data_dir = os.path.join(base_dir, "data")
        self.ticker_file_stream = os.path.join(f"{data_dir}", f"{self.ticker}.csv")
        assert os.path.isfile(
            self.ticker_file_stream
        ), f"Historical stock data file stream not found at: data/{self.ticker}.csv"

        self.ohlcv_df = pd.read_csv(self.ticker_file_stream)
        close = self.ohlcv_df['Close']
        self.ohlcv_df['SMA_10'] = close.rolling(10).mean()
        self.ohlcv_df['SMA_21'] = close.rolling(21).mean()
        self.ohlcv_df['SMA_50'] = close.rolling(50).mean()
        self.ohlcv_df['EMA_10'] = close.ewm(span=10, adjust=False).mean()
        self.ohlcv_df['EMA_21'] = close.ewm(span=21, adjust=False).mean()
        self.ohlcv_df['EMA_50'] = close.ewm(span=50, adjust=False).mean()

        # The rolling means are NaN until their window fills; drop those rows
        # and renumber so that label-based slicing starts at 0 again.
        self.ohlcv_df.dropna(inplace=True)
        self.ohlcv_df.reset_index(drop=True, inplace=True)
        print(self.ohlcv_df)
        self.opening_account_balance = env_config["opening_account_balance"]

        # np.float64 replaces the deprecated np.float alias (same dtype).
        self.action_space = spaces.Box(
            low=np.array([-1]), high=np.array([1]), dtype=np.float64
        )

        self.observation_features = [
            "Close",
            'SMA_10',
            'SMA_21',
            'SMA_50',
            'EMA_10',
            'EMA_21',
            'EMA_50'
        ]

        self.horizon = env_config.get("observation_horizon_sequence_length")
        # get_observation() returns exactly `horizon` columns, so the declared
        # shape uses `horizon` (it previously declared `horizon + 1`).
        # NOTE: the returned values are raw prices/averages; nothing in this
        # class scales them into the [0, 1] bounds declared here.
        self.observation_space = spaces.Box(
            low=0,
            high=1,
            shape=(len(self.observation_features), self.horizon),
            dtype=np.float64,
        )

    def step(self, action):
        """
        Executes one step within the trading environment.

        Args:
            action (float): The action taken by the agent (buy/sell/hold).

        Returns:
            np.ndarray: The next observation for the agent.
            float: The reward obtained from the action.
            bool: A flag indicating if the episode is done.
            dict: Additional information (empty in this case).

        """
        # Execute one step within the trading environment
        self.execute_trade_action(action)

        self.current_step += 1

        reward = self.account_value - self.opening_account_balance  # Profit (loss)
        # The episode ends once current_step * horizon reaches the row count.
        done = self.current_step * self.horizon >= len(
            self.ohlcv_df.loc[:, "Open"].values
        )

        obs = self.get_observation()

        return obs, reward, done, {}

    def reset(self):
        """
        Resets the state of the environment to an initial state.

        Returns:
            np.ndarray: The initial observation for the agent.

        """
        # Reset the state of the environment to an initial state
        self.cash_balance = self.opening_account_balance
        self.account_value = self.opening_account_balance
        self.num_shares_held = 0
        self.cost_basis = 0
        self.current_step = 10
        self.trades = []

        return self.get_observation()

    def get_observation(self):
        """
        Generates the current observation for the agent.

        Returns:
            np.ndarray: Array of shape (number of features, horizon) holding the
            ``horizon`` rows that precede ``current_step``.

        """
        # .loc slices by label and includes both endpoints, so this selects
        # exactly `horizon` rows.
        window = self.ohlcv_df.loc[
            self.current_step - self.horizon: self.current_step - 1,
            self.observation_features,
        ]
        return window.to_numpy().T

    def execute_trade_action(self, action):
        """
        Executes the given trade action (buy/sell/hold).

        Args:
            action (float): The action taken by the agent; > 0 buys, < 0 sells,
                0 holds.

        """
        if action == 0:  # Indicates "Hold" action
            # Hold position; No trade to be executed
            return

        # Actions lie in [-1, 1], so the sign selects the side. The previous
        # test `action > 1` could never be true and turned every order into a
        # sell.
        order_type = "buy" if action > 0 else "sell"
        current_price = self.ohlcv_df.loc[self.current_step, "Close"]

        if order_type == "buy":
            # Buy as many whole shares as cash allows, up to the per-order cap.
            num_shares_bought = min(
                int(self.cash_balance / current_price), self.MAX_SHARES_PER_ORDER
            )

            # Skip the bookkeeping when nothing is bought; otherwise the
            # average-cost update would divide by zero shares.
            if num_shares_bought > 0:
                current_cost = self.cost_basis * self.num_shares_held
                additional_cost = num_shares_bought * current_price

                self.cash_balance -= additional_cost
                self.cost_basis = (current_cost + additional_cost) / (
                    self.num_shares_held + num_shares_bought
                )
                self.num_shares_held += num_shares_bought

        elif order_type == "sell":
            # Simulate a SELL order and execute it at current_price
            num_shares_sold = min(self.num_shares_held, self.MAX_SHARES_PER_ORDER)

            self.cash_balance += num_shares_sold * current_price
            self.num_shares_held -= num_shares_sold

        if self.num_shares_held == 0:
            self.cost_basis = 0

        # Update account value
        self.account_value = self.cash_balance + self.num_shares_held * current_price

    def render(self, mode='human', close=False):
        """
        Displays the current state of the environment.

        Args:
            mode (str): Render mode (not used in this implementation).
            close (bool): True if rendering is to be closed (not used in this implementation).

        """
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.cash_balance}')
        print(f'Shares held: {self.num_shares_held}')
        print(f'Avg cost for held shares: {self.cost_basis}')
        print(f'Account value: {self.account_value}')

# Smoke test: build the environment from the default env_config and drive it
# with randomly sampled actions.
if __name__ == "__main__":
    env = StockTradingContinuousEnv()
    obs = env.reset()
    num_episodes = 2  # Number of steps to run (not episodes); increase for a longer run
    for _ in range(num_episodes):
        # Draw a random action from the environment's action space
        action = env.action_space.sample()
        # NOTE(review): `done` is not checked, so the loop keeps stepping even
        # if the environment reports the episode as finished.
        next_obs, reward, done, _ = env.step(action)
        # NOTE(review): this relies on the environment class providing
        # render(); confirm it does, otherwise the call falls through to the
        # gym.Env base implementation.
        env.render()