In [1]:
!pip install numpy pandas matplotlib torch gym yfinance

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

# Baseline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import yfinance as yf

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Download historical stock data

In [7]:
def download_stock_data(ticker, start_date, end_date):
    """Download price history for ``ticker`` via ``yf.download``.

    Args:
        ticker: Symbol forwarded to ``yf.download`` (e.g. ``'AAPL'``).
        start_date: Start of the requested range, forwarded as ``start``.
        end_date: End of the requested range, forwarded as ``end``.

    Returns:
        A new DataFrame with a default integer index; the former index is
        moved into a regular column (presumably ``'Date'``, which the rest
        of this notebook relies on -- confirm for your yfinance version).
        Two-level column labels are flattened to their first level when
        that is unambiguous.

    Raises:
        ValueError: If the download produced no rows.
    """
    data = yf.download(ticker, start=start_date, end=end_date)
    if data is None or data.empty:
        # Fail here with a clear message instead of letting an empty frame
        # crash later inside the environment with an opaque IndexError.
        raise ValueError(
            f"No data returned for ticker {ticker!r} between {start_date} and {end_date}"
        )

    # Some yfinance releases return two-level (field, ticker) columns even for
    # a single symbol, which breaks plain lookups such as row['Close'].
    # Flatten only when the first level alone identifies every column.
    if isinstance(data.columns, pd.MultiIndex):
        top_level = data.columns.get_level_values(0)
        if top_level.is_unique:
            data = data.copy()
            data.columns = top_level.tolist()

    # reset_index() returns a new frame, so the downloaded object is not mutated.
    return data.reset_index()

Define the Trading Environment

In [8]:
class TradingEnv:
    """Single-asset trading simulator that walks a price DataFrame row by row.

    Actions are integers: ``0`` buys as many whole shares as the cash balance
    allows, ``1`` holds, ``2`` sells every share held.

    Args:
        df: DataFrame with a ``'Date'`` column (dropped from observations) and
            a ``'Close'`` column (used as the trade price). Every remaining
            column must be castable to ``float32``.
        initial_balance: Cash at the start of every episode and the baseline
            subtracted from the portfolio value to form the reward.
            Defaults to 10000.
    """

    def __init__(self, df, initial_balance=10000):
        self.df = df
        self.n_step = len(df)
        self.action_space = 3  # Buy, Hold, Sell
        self.initial_balance = initial_balance
        self.reset()

    def reset(self):
        """Start a new episode and return the first observation."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_value = self.balance
        self.total_values = []  # portfolio value recorded after every step
        return self._next_observation()

    def _next_observation(self):
        """Return the current row (without ``'Date'``) as a float32 array."""
        # Clamp to the last row so a terminal step on a very short frame
        # (e.g. a single row) still yields a valid observation.
        row_index = min(self.current_step, self.n_step - 1)
        row = self.df.iloc[row_index]
        return row.drop('Date').values.astype(np.float32)

    def step(self, action):
        """Apply ``action`` at the current row's close price and advance one row.

        Returns:
            ``(observation, reward, done)`` where ``reward`` is the portfolio
            value minus ``initial_balance`` and ``done`` becomes True once the
            last row has been reached.
        """
        current_price = self.df.iloc[self.current_step]['Close']

        if action == 0:  # Buy: convert all cash into whole shares
            self.shares_held += self.balance // current_price
            self.balance %= current_price
        elif action == 2:  # Sell: liquidate the whole position
            self.balance += self.shares_held * current_price
            self.shares_held = 0

        self.current_step += 1
        # NOTE: the portfolio is valued at the price the action was executed
        # at, not at the next row's price, so the value lags by one step.
        self.total_value = self.balance + self.shares_held * current_price
        self.total_values.append(self.total_value)

        done = self.current_step >= self.n_step - 1
        reward = self.total_value - self.initial_balance
        return self._next_observation(), reward, done

    def render(self):
        """No-op placeholder; nothing is rendered."""
        pass

Define the DQN Agent

In [9]:
class DQNAgent(nn.Module):
    """Epsilon-greedy DQN agent with a small MLP Q-network and replay memory.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions.
        device: Optional device (``torch.device`` or string) for the network.
            Defaults to CUDA when available, otherwise CPU.
    """

    def __init__(self, state_size, action_size, device=None):
        super(DQNAgent, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries are evicted
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # current exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.model = nn.Sequential(
            nn.Linear(state_size, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, action_size)
        ).to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # `verbose` is deprecated in recent PyTorch releases, so it is not passed.
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.1, patience=10
        )
        self.loss_fn = nn.MSELoss()

    def _to_tensor(self, state):
        """Return ``state`` as a float32 tensor on the agent's device with a leading batch axis."""
        tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if tensor.dim() == 1:
            # Accept a bare feature vector as well as the usual (1, state_size) array.
            tensor = tensor.unsqueeze(0)
        return tensor

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action: random with probability ``epsilon``, otherwise greedy.

        Returns:
            The chosen action index as a Python int.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            act_values = self.model(self._to_tensor(state))
        return torch.argmax(act_values[0]).item()

    def replay(self, batch_size):
        """Train on ``batch_size`` transitions sampled from memory.

        Each sampled transition gets its own optimizer step. The LR scheduler
        is stepped once per call with the mean loss, and ``epsilon`` is
        decayed once per call. Does nothing if ``batch_size`` is not positive
        or memory holds fewer than ``batch_size`` transitions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        total_loss = 0.0
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrapped target: reward plus discounted best next-state value.
                with torch.no_grad():
                    next_q = self.model(self._to_tensor(next_state))
                target = reward + self.gamma * torch.max(next_q[0]).item()

            self.optimizer.zero_grad()
            outputs = self.model(self._to_tensor(state))
            # Only the taken action's Q-value is pushed toward the target;
            # the other entries equal the prediction and add no loss.
            target_f = outputs.detach().clone()
            target_f[0][action] = target
            loss = self.loss_fn(outputs, target_f)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()

        # ReduceLROnPlateau expects one metric per evaluation period. Feeding
        # it every noisy single-sample loss collapses the learning rate fast.
        self.scheduler.step(total_loss / batch_size)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load Q-network weights from ``name`` onto the agent's device."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        self.model.load_state_dict(torch.load(name, map_location=self.device))

    def save(self, name):
        """Save the Q-network weights to ``name``."""
        torch.save(self.model.state_dict(), name)

Training the DQN Agent

In [None]:
if __name__ == "__main__":
    # Seed every RNG used by the agent so a fresh run is repeatable.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Download stock data
    ticker = 'AAPL'
    start_date = '2020-01-01'
    end_date = '2023-01-01'
    df = download_stock_data(ticker, start_date, end_date)

    env = TradingEnv(df)
    state_size = df.shape[1] - 1  # exclude 'Date' column
    action_size = 3
    agent = DQNAgent(state_size, action_size)
    batch_size = 32
    episodes = 1000
    predicted_values = []

    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for _ in range(env.n_step):
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            # The environment's reward (profit over the starting balance) is
            # stored unchanged, including on the terminal step.
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print(f"episode: {e}/{episodes}, score: {reward}, e: {agent.epsilon:.2}")
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        if e % 50 == 0:
            agent.save(f"model_{e}.pth")

    # env still holds the portfolio values recorded during the last episode.
    predicted_values = env.total_values

    # Close prices from the dataframe, for visual comparison
    true_values = df['Close'].values

    # Price and portfolio value have different units and scales, so each
    # series gets its own y-axis.
    fig, ax_price = plt.subplots(figsize=(14, 7))
    price_line, = ax_price.plot(true_values, color='tab:blue', label='Close Price')
    ax_price.set_xlabel('Time Step')
    ax_price.set_ylabel('Close Price')

    ax_value = ax_price.twinx()
    value_line, = ax_value.plot(
        predicted_values, color='tab:orange', label='Portfolio Value (last episode)'
    )
    ax_value.set_ylabel('Portfolio Value')

    ax_price.legend(handles=[price_line, value_line])
    ax_price.set_title("Close Price vs Portfolio Value (last episode)")
    plt.show()