<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 06 &mdash; Algorithmic Trading**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Prediction Game Revisited

In [1]:
import math
import random
import numpy as np
import pandas as pd
from pylab import plt, mpl
import torch

In [2]:
plt.style.use('seaborn-v0_8')
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.family'] = 'serif'
np.set_printoptions(suppress=True)

In [3]:
from finance import *

In [4]:
from finance import *

In [5]:
from finance import *

finance = Finance('GLD', 'r', min_accuracy=47.5, n_features=8)
#finance.data[finance.symbol].plot(title=finance.symbol, lw=1.0, c='b');

In [6]:
from dqlagent_pytorch import *

In [7]:
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)

<torch._C.Generator at 0x297b9f99390>

In [8]:
dqlagent = DQLAgent(finance.symbol, finance.feature,
                 finance.n_features, finance, lr=0.0001)

In [9]:
%time dqlagent.learn(500)

episode=   1 | treward=  4.000 | max=  4.000
episode=   2 | treward=  6.000 | max=  6.000
episode=   3 | treward=  4.000 | max=  6.000
episode=   4 | treward=  3.000 | max=  6.000
episode=   5 | treward=  4.000 | max=  6.000
episode=   6 | treward=  0.000 | max=  6.000
episode=   7 | treward=  6.000 | max=  6.000
episode=   8 | treward=  8.000 | max=  8.000
episode=   9 | treward=  3.000 | max=  8.000
episode=  10 | treward=  3.000 | max=  8.000
episode=  11 | treward=  4.000 | max=  8.000
episode=  12 | treward=  3.000 | max=  8.000
episode=  13 | treward=  5.000 | max=  8.000
episode=  14 | treward=  5.000 | max=  8.000
episode=  15 | treward=  3.000 | max=  8.000
episode=  16 | treward=  3.000 | max=  8.000
episode=  17 | treward=  3.000 | max=  8.000
episode=  18 | treward=  5.000 | max=  8.000
episode=  19 | treward=  5.000 | max=  8.000
episode=  20 | treward=  5.000 | max=  8.000
episode=  21 | treward=  2.000 | max=  8.000
episode=  22 | treward=  4.000 | m

In [10]:
dqlagent.test(3)

total reward=2507 | accuracy=0.517
total reward=2507 | accuracy=0.525
total reward=2507 | accuracy=0.499



In [11]:
from simulation import Simulation

In [12]:
random.seed(500)

In [13]:
simulation = Simulation('SYMBOL', 'r', 4, '2025-1-1', '2027-1-1',
                2 * 252, min_accuracy=0.5, x0=1, kappa=1,
                theta=0.75, sigma=0.1, new=True, normalize=True)

In [14]:
for t in range(5):
    simulation.reset()
    #simulation.data[simulation.symbol].plot(title=simulation.symbol, lw=1.0, c='b')

In [15]:
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)


<torch._C.Generator at 0x297b9f99390>

In [16]:
agent = DQLAgent(simulation.symbol, simulation.feature,
                 simulation.n_features, simulation)

In [17]:
%time agent.learn(250)

episode=   1 | treward= 12.000 | max= 12.000
episode=   2 | treward=  6.000 | max= 12.000
episode=   3 | treward= 14.000 | max= 14.000
episode=   4 | treward=259.000 | max=259.000
episode=   5 | treward=  5.000 | max=259.000
episode=   6 | treward=  8.000 | max=259.000
episode=   7 | treward= 22.000 | max=259.000
episode=   8 | treward=  5.000 | max=259.000
episode=   9 | treward= 18.000 | max=259.000
episode=  10 | treward=  6.000 | max=259.000
episode=  11 | treward=182.000 | max=259.000
episode=  12 | treward= 44.000 | max=259.000
episode=  13 | treward=  6.000 | max=259.000
episode=  14 | treward=263.000 | max=263.000
episode=  15 | treward=  4.000 | max=263.000
episode=  16 | treward=  9.000 | max=263.000
episode=  17 | treward=  7.000 | max=263.000
episode=  18 | treward= 32.000 | max=263.000
episode=  19 | treward= 57.000 | max=263.000
episode=  20 | treward=  8.000 | max=263.000
episode=  21 | treward=271.000 | max=271.000
episode=  22 | treward=  7.000 | m

In [18]:
agent.test(5)

total reward= 499 | accuracy=0.499
total reward= 499 | accuracy=0.505
total reward= 499 | accuracy=0.483
total reward= 499 | accuracy=0.499
total reward= 499 | accuracy=0.517



In [19]:
class ActionSpace:
    n = 2
    def sample(self):
        return random.randint(0, 1)

In [20]:
class Trading:
    def __init__(self, symbol, features, window, lags,
                 start, end, periods,
                 x0=100, kappa=1, theta=100, sigma=0.2,
                 leverage=1, min_accuracy=0.5, min_performance=0.85,
                 mu=None, std=None,
                 new=True, normalize=True):
        self.symbol = symbol
        self.features = features
        self.n_features = len(features)
        self.window = window
        self.lags = lags
        self.start = start
        self.end = end
        self.periods = periods
        self.x0 = x0
        self.kappa = kappa
        self.theta = theta
        self.sigma = sigma
        self.leverage = leverage
        self.min_accuracy = min_accuracy
        self.min_performance = min_performance
        self.start = start
        self.end = end
        self.mu = mu
        self.std = std
        self.new = new
        self.normalize = normalize
        self.action_space = ActionSpace()
        self._simulate_data()
        self._prepare_data()

In [21]:
class Trading(Trading):
    def _simulate_data(self):
        index = pd.date_range(start=self.start,
                    end=self.end, periods=self.periods)
        s = [self.x0]
        dt = (index[-1] - index[0]).days / 365 / self.periods
        for t in range(1, len(index)):
            s_ = (s[t - 1] + self.kappa * (self.theta - s[t - 1]) * dt +
              s[t - 1] * self.sigma * math.sqrt(dt) *
                random.gauss(0, 1))
            s.append(s_)
        self.data = pd.DataFrame(s, columns=[self.symbol], index=index)

In [22]:
class Trading(Trading):
    def _prepare_data(self):
        self.data['r'] = np.log(self.data / self.data.shift(1))
        self.data.dropna(inplace=True)
        # additional features
        if self.window > 0:
            self.data['SMA'] = self.data[
                self.symbol].rolling(self.window).mean()
            self.data['DEL'] = self.data[
                self.symbol] - self.data['SMA']
            self.data['MIN'] = self.data[
                self.symbol].rolling(self.window).min()
            self.data['MAX'] = self.data[
                self.symbol].rolling(self.window).max()
            self.data['MOM'] = self.data['r'].rolling(
                self.window).mean()
            # add more features here
            self.data.dropna(inplace=True)
        if self.normalize:
            if self.mu is None or self.std is None:
                self.mu = self.data.mean()
                self.std = self.data.std()
            self.data_ = (self.data - self.mu) / self.std
        else:
            self.data_ = self.data.copy()
        self.data['d'] = np.where(self.data['r'] > 0, 1, 0)
        self.data['d'] = self.data['d'].astype(int)

In [23]:
class Trading(Trading):
    def _get_state(self):
        return self.data_[self.features].iloc[self.bar -
                                self.lags:self.bar]
    def seed(self, seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
    def reset(self):
        if self.new:
            self._simulate_data()
            self._prepare_data()
        self.treward = 0
        self.accuracy = 0
        self.actions = list()
        self.returns = list()
        self.performance = 1
        self.bar = self.lags
        state = self._get_state()
        return state.values, {}

In [24]:
class Trading(Trading):
    def step(self, action):
        correct = action == self.data['d'].iloc[self.bar]
        ret = self.data['r'].iloc[self.bar] * self.leverage
        reward_ = 1 if correct else 0
        pl = abs(ret) if correct else -abs(ret)
        reward = reward_
        # alternative options:
        # reward = pl  # only the P&L in log returns
        # reward = reward_ + 10 * pl  # the reward + the scaled P&L
        self.treward += reward
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.lags) 
        self.performance *= math.exp(pl)
        if self.bar >= len(self.data):
            done = True
        elif reward_ == 1:
            done = False
        elif (self.accuracy < self.min_accuracy and
              self.bar > self.lags + 15):
            done = True
        elif (self.performance < self.min_performance and
              self.bar > self.lags + 15):
            done = True
        else:
            done = False
        state = self._get_state()
        return state.values, reward, done, False, {}

In [25]:
symbol = 'SYMBOL'

In [26]:
trading = Trading(symbol, [symbol, 'r', 'DEL'], window=10, lags=5,
            start='2024-1-1', end='2026-1-1', periods=504,
            x0=100, kappa=2, theta=300, sigma=0.1, normalize=False)

In [27]:
random.seed(750)

In [28]:
trading.reset()

(array([[115.90591443,   0.01926915,   6.89239862],
        [117.17850569,   0.01091968,   6.5901155 ],
        [118.79489427,   0.01369997,   6.65876779],
        [120.63380354,   0.01536111,   6.92684742],
        [121.81132396,   0.00971378,   6.65768164]]),
 {})

In [29]:
trading.data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 494 entries, 2024-01-15 12:47:14.194831014 to 2026-01-01 00:00:00
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SYMBOL  494 non-null    float64
 1   r       494 non-null    float64
 2   SMA     494 non-null    float64
 3   DEL     494 non-null    float64
 4   MIN     494 non-null    float64
 5   MAX     494 non-null    float64
 6   MOM     494 non-null    float64
 7   d       494 non-null    int32  
dtypes: float64(7), int32(1)
memory usage: 32.8 KB


In [30]:
#trading.data.iloc[-200:][[trading.symbol, 'SMA', 'MIN', 'MAX']].plot(style=['b-', 'r--', 'g:', 'g:'], lw=1.0);

In [31]:
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)

<torch._C.Generator at 0x297b9f99390>

In [32]:
trading = Trading(symbol, ['r', 'DEL', 'MOM'], window=10, lags=8,
            start='2024-1-1', end='2026-1-1', periods=2 * 252,
            x0=100, kappa=2, theta=50, sigma=0.1,
            leverage=1, min_accuracy=0.5, min_performance=0.85,
            new=True, normalize=True)

In [33]:
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers import Dense, Flatten
from keras import Sequential
import os
import random
import warnings
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque

warnings.simplefilter('ignore')
os.environ['PYTHONHASHSEED'] = '0'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hu=24):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hu)
        self.fc2 = nn.Linear(hu, hu)
        self.fc3 = nn.Linear(hu, action_dim)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQLAgent:
    def __init__(self, symbol, feature, n_features, env, hu=24, lr=0.001):
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        self.gamma = 0.5
        self.trewards = []
        self.max_treward = -np.inf
        self.n_features = n_features
        self.env = env
        self.episodes = 0
        # Q-Network and optimizer
        self.model = QNetwork(self.n_features, self.env.action_space.n, hu).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.symbol = symbol
        self.feature = feature
        self.n_features = n_features

        # Si env est un entier, nous devons l'adapter
        if isinstance(env, int):
            # Créer une structure simple pour simuler l'environnement
            class SimpleEnv:
                def __init__(self, n_actions):
                    self.action_space = type('obj', (object,), {'n': n_actions})

            self.env = SimpleEnv(env)  # env représente le nombre d'actions
        else:
            self.env = env  # env est déjà un environnement approprié

        self.hu = hu
        self.lr = lr

        # Reste du code d'initialisation...
        self.episodes = 0
        # Q-Network and optimizer
        self.model = QNetwork(self.n_features, self.env.action_space.n, hu).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()



    def _reshape(self, state):
        state = state.flatten()
        return np.reshape(state, [1, len(state)])

    def act(self, state):
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state_tensor = torch.FloatTensor(state).to(device)
        if state_tensor.dim() == 1:
            state_tensor = state_tensor.unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return int(torch.argmax(q_values[0]).item())

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states = np.vstack([e[0] for e in batch])
        actions = np.array([e[1] for e in batch])
        next_states = np.vstack([e[2] for e in batch])
        rewards = np.array([e[3] for e in batch], dtype=np.float32)
        dones = np.array([e[4] for e in batch], dtype=bool)

        states_tensor = torch.FloatTensor(states).to(device)
        next_states_tensor = torch.FloatTensor(next_states).to(device)
        actions_tensor = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards_tensor = torch.FloatTensor(rewards).to(device)
        dones_tensor = torch.BoolTensor(dones).to(device)

        current_q = self.model(states_tensor).gather(1, actions_tensor).squeeze(1)
        next_q = self.model(next_states_tensor).max(1)[0]
        target_q = rewards_tensor + self.gamma * next_q * (~dones_tensor).float()

        loss = self.criterion(current_q, target_q.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        for e in range(1, episodes + 1):
            self.episodes += 1
            state, _ = self.env.reset()
            state = self._reshape(state)
            treward = 0
            for f in range(1, 5000):
                self.f = f
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                treward += reward
                next_state = self._reshape(next_state)
                self.memory.append((state, action, next_state, reward, done))
                state = next_state
                if done:
                    self.trewards.append(treward)
                    self.max_treward = max(self.max_treward, treward)
                    templ = f'episode={self.episodes:4d} | '
                    templ += f'treward={treward:7.3f} | max={self.max_treward:7.3f}'
                    print(templ, end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
            print()

    def test(self, episodes, min_accuracy=0.0, min_performance=0.0, verbose=True, full=True):
        # Backup and set environment thresholds
        ma = getattr(self.env, 'min_accuracy', None)
        if hasattr(self.env, 'min_accuracy'):
            self.env.min_accuracy = min_accuracy
        mp = None
        if hasattr(self.env, 'min_performance'):
            mp = self.env.min_performance
            self.env.min_performance = min_performance
            self.performances = []
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = self._reshape(state)
            for f in range(1, 5001):
                action = self.act(state)
                state, reward, done, trunc, _ = self.env.step(action)
                state = self._reshape(state)
                if done:
                    templ = f'total reward={f:4d} | accuracy={self.env.accuracy:.3f}'
                    if hasattr(self.env, 'min_performance'):
                        self.performances.append(self.env.performance)
                        templ += f' | performance={self.env.performance:.3f}'
                    if verbose:
                        if full:
                            print(templ)
                        else:
                            print(templ, end='\r')
                    break
        # Restore environment thresholds
        if hasattr(self.env, 'min_accuracy') and ma is not None:
            self.env.min_accuracy = ma
        if mp is not None:
            self.env.min_performance = mp
        print()



class TradingAgent(DQLAgent):
    def _create_model(self, hu, lr):
        self.model = Sequential()
        self.model.add(Dense(hu, input_dim=
            self.env.lags * self.env.n_features,
                        activation='relu'))
        self.model.add(Flatten())
        self.model.add(Dense(hu, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse',
            optimizer=Adam(learning_rate=lr))

tradingagent = TradingAgent(trading.symbol, trading.features,
                 trading.lags * trading.n_features, trading, hu=24, lr=0.0001)

In [34]:
%%time
tradingagent.test(100, min_accuracy=0.0, min_performance=0.0, verbose=True, full=False)

total reward= 486 | accuracy=0.481 | performance=0.952
CPU times: total: 23.8 s
Wall time: 32.1 s


In [35]:
random_performances = tradingagent.performances

In [36]:
sum(random_performances) / len(random_performances)

0.993043981488391

In [37]:
#plt.hist(random_performances, bins=50, color='b')
#plt.xlabel('gross performance')
#plt.ylabel('frequency');

In [38]:
%time tradingagent.learn(500)

episode=   1 | treward= 11.000 | max= 11.000
episode=   2 | treward=121.000 | max=121.000
episode=   3 | treward=  6.000 | max=121.000
episode=   4 | treward=  9.000 | max=121.000
episode=   5 | treward=  7.000 | max=121.000
episode=   6 | treward=  5.000 | max=121.000
episode=   7 | treward=  4.000 | max=121.000
episode=   8 | treward= 11.000 | max=121.000
episode=   9 | treward= 29.000 | max=121.000
episode=  10 | treward=  8.000 | max=121.000
episode=  11 | treward= 17.000 | max=121.000
episode=  12 | treward= 11.000 | max=121.000
episode=  13 | treward= 31.000 | max=121.000
episode=  14 | treward=  8.000 | max=121.000
episode=  15 | treward= 11.000 | max=121.000
episode=  16 | treward= 14.000 | max=121.000
episode=  17 | treward=  7.000 | max=121.000
episode=  18 | treward=  7.000 | max=121.000
episode=  19 | treward=256.000 | max=256.000
episode=  20 | treward=  5.000 | max=256.000
episode=  21 | treward= 13.000 | max=256.000
episode=  22 | treward= 18.000 | m

In [39]:
%%time
tradingagent.test(50, min_accuracy=0.0,
           min_performance=0.0,
           verbose=True, full=False)

total reward= 486 | accuracy=0.547 | performance=1.678
CPU times: total: 9.56 s
Wall time: 9.89 s


In [40]:
sum(tradingagent.performances) / len(tradingagent.performances)

1.321637016492338

In [41]:
#plt.hist(random_performances, bins=30, color='b', label='random (left)')
#plt.hist(tradingagent.performances, bins=30, color='r', label='trained (right)')
#plt.xlabel('gross performance')
#plt.ylabel('frequency')
#plt.legend();

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>