In [1]:
import gym
from gym import spaces
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the Gym Environment
class StockTradingEnv(gym.Env):
    """Portfolio trading environment over several stocks (classic Gym API).

    Each step advances every stock by one row of its DataFrame. The action is
    a vector with one entry per LSTM model giving the fraction of the current
    balance to hold in that stock for the step. The observation is, for every
    stock in ``data``, the current row's values followed by the LSTM's
    prediction of the next close.

    ``reset`` returns only the observation and ``step`` returns the 4-tuple
    ``(obs, reward, done, info)``.

    Args:
        data: dict mapping ticker -> DataFrame. The code reads the ``'Close'``
            column by name and every column positionally for the observation.
        lstm_models: dict mapping ticker -> model exposing ``predict``.
        scalers: dict mapping ticker -> scaler exposing ``transform`` and
            ``inverse_transform`` for the close price.
        sequence_length: Number of past closes fed to the LSTM. Must match the
            window length the models were trained with (default 5).
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, data, lstm_models, scalers, sequence_length=5):
        super(StockTradingEnv, self).__init__()

        # Data is a dictionary mapping tickers to their DataFrame
        self.data = data
        # LSTM models and scalers for each of the top stocks
        self.lstm_models = lstm_models
        self.scalers = scalers
        self.sequence_length = sequence_length

        self._configure_spaces()

        self.initial_balance = 10000
        self.reset()

    def _configure_spaces(self):
        """(Re)build the action and observation spaces from the current models and data."""
        # One allocation fraction in [0, 1] per stock.
        self.action_space = spaces.Box(low=0, high=1, shape=(len(self.lstm_models),), dtype=np.float32)

        # Concatenation of every stock's row plus one prediction per stock.
        obs_dim = sum(len(df.columns) + 1 for df in self.data.values())
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)

    def reset(self):
        """Restore the starting balance, rewind every stock to row 0 and return the first observation."""
        self.balance = self.initial_balance
        self.index = {ticker: 0 for ticker in self.data}  # Track index for each stock
        self.done = False
        self.total_reward = 0
        return self._next_observation()

    def set_new_models_scalers(self, new_lstm_models, new_scalers, top_tickers, new_data=None):
        """Swap in freshly trained models/scalers (and optionally new data), then reset.

        Args:
            new_lstm_models: dict mapping ticker -> model.
            new_scalers: dict mapping ticker -> scaler.
            top_tickers: List of tickers the new models belong to.
            new_data: Optional dict mapping ticker -> DataFrame that replaces
                ``self.data``. Required whenever the tickers change, because
                ``step`` looks every model's ticker up in ``self.data``.

        Raises:
            ValueError: If a model's ticker has no price data in the environment.
        """
        data = self.data if new_data is None else new_data
        missing = [ticker for ticker in new_lstm_models if ticker not in data]
        if missing:
            raise ValueError(
                f"No price data for tickers {missing}; pass new_data containing them."
            )

        self.data = data
        self.lstm_models = new_lstm_models
        self.scalers = new_scalers
        self.top_tickers = top_tickers
        # Keep the declared spaces in sync with the (possibly) new data/models.
        self._configure_spaces()
        print("Environment updated with new top stocks:", top_tickers)
        self.reset()

    def _next_observation(self):
        """Return the concatenated per-stock rows and predictions as a float32 vector."""
        parts = [
            np.append(self.data[ticker].iloc[index].values, self._predict_next_close(ticker, index))
            for ticker, index in self.index.items()
        ]
        # The observation space is declared float32; pandas rows are float64.
        return np.concatenate(parts).astype(np.float32)

    def _predict_next_close(self, ticker, index):
        """Predict the close of row ``index + 1`` from the closes up to and including ``index``.

        Returns 0.0 while fewer than ``sequence_length`` closes are available.
        """
        window = self.sequence_length
        if index + 1 < window:
            return 0.0  # Not enough data to predict

        # The window ends at the current row so the output refers to the *next*
        # row rather than to a close that is already part of the observation.
        # A DataFrame (not a bare array) keeps the column name the scaler may
        # have been fitted with.
        last_sequence = self.data[ticker][['Close']].iloc[index - window + 1:index + 1]
        last_sequence_scaled = self.scalers[ticker].transform(last_sequence)
        last_sequence_reshaped = np.reshape(last_sequence_scaled, (1, window, 1))

        # verbose=0: this runs once per stock per step; a progress bar each
        # time would flood the notebook output.
        predicted_scaled = self.lstm_models[ticker].predict(last_sequence_reshaped, verbose=0)
        predicted = self.scalers[ticker].inverse_transform(predicted_scaled)[0, 0]
        return float(predicted)

    def step(self, action):
        """Apply one allocation vector and advance every stock by one row.

        Args:
            action: Array-like with one allocation fraction per stock, in the
                order of ``self.lstm_models``.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``reward`` is the
            change in balance over the step, minus 1 if the episode ends with
            a non-positive cumulative reward.
        """
        # Keep fractions inside [0, 1] and never allocate more than the whole
        # balance: each entry may be up to 1, so the raw sum can exceed 1.
        weights = np.clip(np.asarray(action, dtype=np.float64).reshape(-1), 0.0, 1.0)
        total_weight = weights.sum()
        if total_weight > 1.0:
            weights = weights / total_weight

        reward = 0.0
        done = False

        # Update balance and reward for each stock based on the action taken
        for ticker, investment_ratio in zip(self.lstm_models.keys(), weights):
            frame = self.data[ticker]
            last_index = len(frame) - 1
            current_index = self.index[ticker]
            next_index = min(current_index + 1, last_index)
            current_row = frame.iloc[current_index]
            next_row = frame.iloc[next_index]

            # Enter at the current close (the latest price in the observation)
            # and exit at the next close. Pricing the entry at the current
            # open would reward information the agent has already seen.
            investment = self.balance * investment_ratio
            entry_price = current_row['Close']
            stock_return = (next_row['Close'] - entry_price) / entry_price * investment
            reward += stock_return

            self.index[ticker] = next_index
            if next_index >= last_index:
                done = True  # End the episode if we run out of data for any stock

        reward = float(reward)
        self.balance += reward  # Update balance with the combined reward from all actions
        self.total_reward += reward

        # If the episode is done and we haven't made any money, penalize the agent
        if done and self.total_reward <= 0:
            reward -= 1

        self.done = done
        next_state = self._next_observation()

        return next_state, reward, done, {}

# Function to fetch data from Yahoo Finance
def fetch_data(tickers, start_date, end_date):
    """Download price history per ticker from Yahoo Finance and add a 'Return' column.

    Args:
        tickers: Iterable of ticker symbols. Duplicates are downloaded once.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        dict mapping ticker -> DataFrame with an added ``'Return'`` column
        (percentage change of ``'Close'``) and rows containing NaN removed.
        Tickers for which no usable rows came back are left out.
    """
    data = {}
    # dict.fromkeys de-duplicates while preserving the caller's order.
    for ticker in dict.fromkeys(tickers):
        stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if stock_data is None or stock_data.empty:
            print(f"Skipping {ticker}: no data returned")
            continue

        # Some yfinance versions return (field, ticker) MultiIndex columns even
        # for a single symbol; flatten to plain field names so 'Close' selects
        # a Series and row['Close'] is a scalar.
        if isinstance(stock_data.columns, pd.MultiIndex):
            stock_data = stock_data.copy()
            stock_data.columns = stock_data.columns.get_level_values(0)

        # assign() builds a new frame instead of mutating the downloaded one.
        stock_data = stock_data.assign(Return=stock_data['Close'].pct_change()).dropna()
        if stock_data.empty:
            print(f"Skipping {ticker}: no complete rows after cleaning")
            continue

        data[ticker] = stock_data
    return data

# Function to train LSTM models
def train_lstm_models(data, sequence_length=5, epochs=20, batch_size=32):
    """Fit one close-price LSTM and one MinMaxScaler per ticker.

    Each model learns to map ``sequence_length`` consecutive scaled closes to
    the scaled close that follows them.

    Args:
        data: dict mapping ticker -> DataFrame with a ``'Close'`` column.
        sequence_length: Number of past closes per training window. Anything
            that later feeds these models must use the same length.
        epochs: Training epochs per model.
        batch_size: Mini-batch size per model.

    Returns:
        Tuple ``(lstm_models, scalers)``, both dicts keyed by ticker.

    Raises:
        ValueError: If a ticker has too few rows to form a single window.
    """
    lstm_models = {}
    scalers = {}

    for ticker, df in data.items():
        df = df.dropna()  # Ensure there are no NaN values in the data

        if len(df) <= sequence_length:
            raise ValueError(
                f"{ticker}: need more than {sequence_length} rows to build "
                f"training sequences, got {len(df)}"
            )

        # Scale the 'Close' column
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_close = scaler.fit_transform(df[['Close']])[:, 0]

        # Window i covers rows [i, i + sequence_length) and its target is row
        # i + sequence_length, so the final window (which has no target) is dropped.
        windows = np.lib.stride_tricks.sliding_window_view(scaled_close, sequence_length)[:-1]
        X = windows.reshape((windows.shape[0], sequence_length, 1))
        y = scaled_close[sequence_length:]

        # Build the LSTM model
        model = Sequential([
            LSTM(units=50, return_sequences=True, input_shape=(sequence_length, 1)),
            LSTM(units=50),
            Dense(units=1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')

        # Train the model
        model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=0)

        lstm_models[ticker] = model
        scalers[ticker] = scaler

    return lstm_models, scalers

# Sort the tickers by performance and pick the top 5
def sort_stocks_by_performance(data, top_n=5):
    """Rank tickers by compounded return over the period and return the best ones.

    Args:
        data: dict mapping ticker -> DataFrame with a ``'Return'`` column of
            per-period simple returns.
        top_n: How many tickers to keep (default 5).

    Returns:
        List of at most ``top_n`` tickers, best performer first.
    """
    # Compound the returns rather than summing them: +100% followed by -50%
    # is a 0% total gain, not +50%.
    performance = {
        ticker: float((1.0 + df['Return']).prod() - 1.0)
        for ticker, df in data.items()
    }
    sorted_tickers = sorted(performance, key=performance.get, reverse=True)
    top_tickers = sorted_tickers[:top_n]
    print(f"Top {top_n} performing stocks:", top_tickers)
    return top_tickers


# Main code to create the environment and train models
# Candidate universe; each symbol appears once so nothing is downloaded twice.
tickers = [
    'AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'BRK-B', 'JNJ', 'JPM', 'V', 'PG', 'UNH', 'DIS', 'NVDA', 'HD',
    'PYPL', 'BAC', 'VZ', 'ADBE', 'CMCSA', 'NFLX', 'KO', 'PFE', 'NKE', 'T', 'ABT', 'PEP', 'CVX', 'ORCL', 'CSCO',
    'XOM', 'ACN', 'TMO', 'AVGO', 'QCOM', 'COST', 'C', 'LLY', 'WFC', 'DHR', 'MCD', 'MDT', 'INTC', 'TXN', 'HON',
    'UNP', 'BMY', 'LIN', 'BA', 'AMGN', 'IBM', 'GE', 'MMM', 'SBUX', 'RTX', 'CAT', 'DE', 'GS', 'MS', 'CVS'
]

# Historical window used both for ranking the stocks and for training.
start_date = "2021-01-01"
end_date = "2023-01-01"

# Download everything, keep the best performers, and fit one LSTM per kept stock.
data = fetch_data(tickers, start_date, end_date)
top_tickers = sort_stocks_by_performance(data)
top_data = {ticker: data[ticker] for ticker in top_tickers}
lstm_models, scalers = train_lstm_models(top_data)

# Create a single environment that manages a portfolio of the top stocks
env = StockTradingEnv(top_data, lstm_models, scalers)

from stable_baselines3 import A2C
from stable_baselines3.common.env_checker import check_env

# Check if the environment follows Gym API
check_env(env)

# Initialize the A2C agent from Stable Baselines3
agent = A2C("MlpPolicy", env, verbose=1)

# Define update frequency for the training loop to re-evaluate top stocks
update_frequency = 10  # For example, update every 10 episodes

# Train the agent
total_timesteps = 0  # Environment steps the agent has been trained on so far
num_episodes = 20  # Set the number of episodes for training
episode_rewards = []  # To store rewards for each episode, for analysis and plotting


def _episode_length(frames):
    """Steps in one episode: the shortest stock runs out of rows first."""
    return max(min(len(df) for df in frames.values()) - 1, 1)


steps_per_episode = _episode_length(top_data)

# Evaluation uses its own environment so that stepping it by hand does not
# disturb the state of the environment the agent is training on.
eval_env = StockTradingEnv(top_data, lstm_models, scalers)

for episode in range(num_episodes):
    # A2C collects its own rollouts and updates the policy inside learn();
    # predict() only runs inference and never trains. reset_num_timesteps=False
    # continues the same training run across iterations.
    agent.learn(total_timesteps=steps_per_episode, reset_num_timesteps=False)
    total_timesteps += steps_per_episode

    # Play one deterministic episode to record how the current policy performs.
    state = eval_env.reset()
    total_episode_reward = 0
    steps = 0
    done = False
    while not done:
        action, _states = agent.predict(state, deterministic=True)
        state, reward, done, _ = eval_env.step(action)
        total_episode_reward += reward
        steps += 1

    episode_rewards.append(total_episode_reward)
    print(f"Episode: {episode + 1}, Total Reward: {total_episode_reward}, Steps: {steps}")

    # Update top stocks and retrain LSTM models periodically
    if episode % update_frequency == 0 and episode > 0:
        print("Updating top stocks and retraining models...")
        data = fetch_data(tickers, start_date, end_date)
        top_tickers = sort_stocks_by_performance(data)
        top_data = {ticker: data[ticker] for ticker in top_tickers}
        lstm_models, scalers = train_lstm_models(top_data)

        # The environment looks every model's ticker up in its own data, so
        # the price frames have to be replaced before the models are.
        env.data = top_data
        env.set_new_models_scalers(lstm_models, scalers, top_tickers)
        # Re-attach the environment so the agent starts from a fresh reset
        # instead of a cached observation of the old data.
        agent.set_env(env)

        eval_env = StockTradingEnv(top_data, lstm_models, scalers)
        steps_per_episode = _episode_length(top_data)

# After training, you can analyze the performance of the agent
# For example, you could plot the episode_rewards list to see the learning curve

# After training, you can analyze the performance of the agent
# For example, you could plot the episode_rewards list to see the learning curve


ModuleNotFoundError: No module named 'gym'

In [1]:
!pip install gym --upgrade




In [None]:
!pip install stable-baselines3

In this code, the environment manages a portfolio of stocks instead of just one. The `action_space` is a vector that allocates a fraction of the balance to each stock, and the `observation_space` covers the current data and the LSTM prediction for every top stock. The `step` function updates the balance based on the investment action taken for each stock.

The reward is calculated as the sum of the returns from all stocks in the portfolio. If any stock runs out of data (i.e., reaches the end of its DataFrame), the episode ends (`done` is set to `True`).

To use this environment, create an instance by passing the data, models, and scalers for the top stocks, and then integrate it with your RL agent's training loop. The RL agent will need to be capable of handling the multi-dimensional action space where each dimension corresponds to a stock in the portfolio.

Please make sure that the LSTM models are properly trained and the scalers are fit to the data before you create the environment instance. The training loop will need to be adjusted to handle the new action space's structure and the multi-stock portfolio's state representation.

In [11]:
pip install gymnasium




In [12]:

import gymnasium as gym


In [3]:
!pip install stable-baselines3[extra]


Collecting shimmy[atari]~=1.1.0 (from stable-baselines3[extra])
  Downloading Shimmy-1.1.0-py3-none-any.whl (37 kB)
Collecting autorom[accept-rom-license]~=0.6.1 (from stable-baselines3[extra])
  Downloading AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.6.1->stable-baselines3[extra])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ale-py~=0.8.1 (from shimmy[atari]~=1.1.0->stable-baselines3[extra])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m45.4 MB/s[0m eta [36m0:0