In [None]:
pip install pandas_ta stable_baselines3 shimmy

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m112.6/115.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable_baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of shimmy to determine which version is compatible with other requirements. This could take a while.
Collecting shimmy
  Downloading Shimmy-1.3.0-py3-none-

In [None]:
import pandas as pd
import yfinance as yf

# function to get stock from yfinance
def get_stock_data_with_indicators(stock_symbols):
    """Download 10 years of daily bars per symbol and derive indicator columns.

    For the symbol at position ``i`` the frame receives eight columns, in this
    order: ``low_i``, ``close_i``, ``high_i``, ``open_i``, ``volume_i``,
    ``ema10_i``, ``ema60_i`` and ``rsi_i``. After all symbols, one trailing
    ``week`` column (ISO week number of each row's date) is appended.

    Args:
        stock_symbols: iterable of ticker strings handed to ``yf.download``.

    Returns:
        pandas.DataFrame whose date index has been replaced by 0..n-1 and whose
        gaps are forward-filled, then backward-filled. An empty DataFrame is
        returned when ``stock_symbols`` yields no symbols.
    """
    stock_data = pd.DataFrame()

    # --- loop to get HLOC and compute indicator ema10 and ema60 + RSI
    for i, symbol in enumerate(stock_symbols):

        # download data
        data = yf.download(symbol, period="10y", interval="1d")
        close = data['Close']

        # HLOC (+ volume); column order matters to consumers that slice by position
        stock_data[f'low_{i}'] = data['Low']
        stock_data[f'close_{i}'] = close
        stock_data[f'high_{i}'] = data['High']
        stock_data[f'open_{i}'] = data['Open']
        stock_data[f'volume_{i}'] = data['Volume']

        # ema10 and ema60
        stock_data[f'ema10_{i}'] = close.ewm(span=10, adjust=False).mean()
        stock_data[f'ema60_{i}'] = close.ewm(span=60, adjust=False).mean()

        # RSI from 14-day simple rolling means of gains and losses
        delta = close.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        stock_data[f'rsi_{i}'] = 100 - (100 / (1 + rs))

    # nothing was requested: return the empty frame instead of touching the
    # (unbound) per-symbol download below
    if len(stock_data.columns) == 0:
        return stock_data

    # add week infos, derived from the frame's own row index so it does not
    # depend on whichever symbol happened to be downloaded last
    stock_data['week'] = pd.DatetimeIndex(stock_data.index).isocalendar().week

    # important: handle NaN values (forward fill, then backward fill any
    # leading gaps such as the indicator warm-up rows)
    stock_data = stock_data.ffill().bfill()

    return stock_data.reset_index(drop=True)

# tickers to fetch; the position in this list becomes the `_<i>` column suffix
stock_symbols = ["NQ=F"]

# feature frame consumed by the trading environment
stock_data = get_stock_data_with_indicators(stock_symbols)

[*********************100%***********************]  1 of 1 completed
  stock_data.fillna(method='ffill', inplace=True)  # Forward fill NaN values
  stock_data.fillna(method='bfill', inplace=True)  # Backward fill if any initial NaN values remain


In [None]:
import gym
import numpy as np
import pandas as pd
import yfinance as yf
from stable_baselines3 import PPO
from gym import spaces

class StockEnv(gym.Env):
    """Daily bet/no-bet environment using the legacy ``gym`` API.

    Every step the agent picks, for each stock, whether to place a bet (1) or
    not (0). A bet risks a fixed 1000 units with the stop at the current day's
    low:

    * if the next day's low reaches the current day's low, the bet is stopped
      out and 1000 is lost;
    * otherwise the result is
      ``(next close - close) / (close - low) * 1000`` (0 when close == low).

    ``stock_data`` is expected to hold 8 feature columns per stock followed by
    one trailing column that is excluded from observations; this class reads
    the ``close_<i>`` and ``low_<i>`` columns by name.

    An observation is the flattened feature window of the ``history_days`` rows
    preceding the current day, with the current equity appended as the last
    element.
    """

    def __init__(self, stock_data, initial_equity=100000, max_bets=10, history_days=30):
        """Set up spaces and episode state.

        Args:
            stock_data: pandas.DataFrame of features (see class docstring).
            initial_equity: equity at the start of every episode.
            max_bets: stored on the instance; not otherwise used by this class.
            history_days: number of past rows included in each observation.
        """
        super().__init__()
        self.stock_data = stock_data.reset_index(drop=True)  # index reset
        self.current_day = history_days  # shift to have historic data
        self.max_bets = max_bets
        self.max_days = len(stock_data)
        self.history_days = history_days  # past data (now in days)

        # equity initialization; the starting value is kept so reset() can restore it
        self.initial_equity = initial_equity
        self.equity = initial_equity

        # get num of stocks (important: implied 8 features for stocks,
        # plus one trailing non-feature column)
        self.num_stocks = (len(stock_data.columns) - 1) // 8

        # action space is binary, bet or not
        self.action_space = spaces.MultiBinary(self.num_stocks)

        # observation space: 8 features per stock for history_days rows + equity
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=((self.num_stocks * 8 * self.history_days) + 1,),
            dtype=np.float32
        )

    def reset(self):
        """Rewind to the first playable day, restore equity, return the first observation."""
        self.current_day = self.history_days
        self.equity = self.initial_equity  # honour the constructor argument
        return self._get_observation()

    def step(self, action):
        """Apply one day's bets.

        Args:
            action: array-like of 0/1 flags, one per stock.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty dict.
        """
        # ensure action is valid (binary for each stock)
        action = np.clip(action, 0, 1)

        # current day's row and, when it exists, the following day's row
        current_data = self.stock_data.iloc[self.current_day]
        has_next_day = self.current_day + 1 < self.max_days
        next_data = self.stock_data.iloc[self.current_day + 1] if has_next_day else None

        reward = 0
        done = False

        for i in range(self.num_stocks):
            # bets can only be settled when a following day exists
            if action[i] != 1 or next_data is None:
                continue

            close_this_day = current_data[f'close_{i}']
            low_this_day = current_data[f'low_{i}']

            # the EMA10 filter is currently disabled: any positive close qualifies
            if not close_this_day > 0:
                continue

            low_next_day = next_data[f'low_{i}']
            close_next_day = next_data[f'close_{i}']

            if low_next_day <= low_this_day:
                # stop-loss hit: next day's low reached the stop at today's low
                reward -= 1000  # loss of 1%
                self.equity -= 1000  # deducted from equity
            else:
                # stop not hit: outcome measured in units of (close - low)
                denominator = close_this_day - low_this_day
                if denominator == 0:
                    ratio = 0  # avoid division by zero
                else:
                    ratio = (close_next_day - close_this_day) / denominator

                gain = ratio * 1000
                reward += gain * 10  # reward is scaled 10x relative to the equity change
                self.equity += gain

        # if agent picks no stocks, apply a penalty (push the agent to explore)
        if np.sum(action) == 0:
            reward -= 1000

        # move to the next day
        self.current_day += 1

        # check if the equity is below zero (end of the game)
        if self.equity <= 0:
            done = True
            reward -= 100000  # extra penalty for going bankrupt

        # stop one row before the end so a following day always exists
        if self.current_day >= self.max_days - 1:
            done = True

        # get next observation (for the next day)
        next_observation = self._get_observation()

        return next_observation, reward, done, {}

    def _get_observation(self):
        """Return the flattened history window plus current equity as float32."""
        # rows [current_day - history_days, current_day), every column except
        # the trailing non-feature one; flattened day by day
        window = self.stock_data.iloc[
            self.current_day - self.history_days: self.current_day, :-1
        ]
        features = window.to_numpy(dtype=np.float32).ravel()

        # current equity is the last element of the observation
        equity = np.array([self.equity], dtype=np.float32)
        return np.concatenate((features, equity))

    def render(self, mode='human'):
        """Print the current day index and equity."""
        print(f"Day: {self.current_day}, Equity: {self.equity}")

# ----- Agents

# build the environment from the `stock_data` frame
env = StockEnv(stock_data)

# create a PPO agent with an MLP policy, then train it
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=100_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 166      |
|    ep_rew_mean     | -8.1e+05 |
| time/              |          |
|    fps             | 130      |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 168           |
|    ep_rew_mean          | -8.05e+05     |
| time/                   |               |
|    fps                  | 129           |
|    iterations           | 2             |
|    time_elapsed         | 31            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 4.2139436e-06 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | 7.75e-07      |


In [None]:
# roll the trained agent through the environment for at most 250 steps,
# stopping early when the episode ends
obs = env.reset()
steps_left = 250
done = False
while steps_left > 0 and not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()  # show current day and equity
    steps_left -= 1