<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 02 &mdash; Deep Q-Learning**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## CartPole

### The Game Environment 

In [1]:
import gymnasium as gym

In [2]:
# Create the CartPole-v1 environment used by the exploratory cells below.
env = gym.make('CartPole-v1')

In [3]:
# Inspect the environment's action space.
env.action_space

Discrete(2)

In [4]:
# Number of distinct actions in the action space.
env.action_space.n

2

In [5]:
# Draw ten randomly sampled actions from the action space.
[env.action_space.sample() for _ in range(10)]

[0, 1, 1, 0, 1, 1, 1, 1, 0, 0]

In [6]:
# Inspect the environment's observation space.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [7]:
# Shape of a single observation.
env.observation_space.shape

(4,)

In [8]:
# Reset the environment with a fixed seed; the displayed result is the
# initial observation followed by a second element (an empty dict here).
env.reset(seed=100)
# Observation components, in order:
# cart position, cart velocity, pole angle, pole angular velocity

(array([ 0.03349816,  0.0096554 , -0.02111368, -0.04570484], dtype=float32),
 {})

In [9]:
# Take action 0; step() returns a 5-tuple that the agents below unpack as
# (state, reward, done, trunc, info).
env.step(0)

(array([ 0.03369127, -0.18515752, -0.02202777,  0.24024247], dtype=float32),
 1.0,
 False,
 False,
 {})

In [10]:
# Take action 1 from the state reached by the previous step.
env.step(1)

(array([ 0.02998812,  0.01027205, -0.01722292, -0.05930644], dtype=float32),
 1.0,
 False,
 False,
 {})

In [11]:
class RandomAgent:
    """Baseline agent that plays CartPole-v1 with randomly sampled actions."""
    def __init__(self):
        self.env = gym.make('CartPole-v1')
    def play(self, episodes=1, max_steps=99):
        """Play ``episodes`` episodes and store their lengths in ``self.trewards``.

        Parameters
        ----------
        episodes : int
            Number of episodes to play.
        max_steps : int
            Upper bound on the number of steps per episode (default 99,
            matching the previous hard-coded ``range(1, 100)``).

        Every episode contributes exactly one entry to ``self.trewards``:
        the number of steps taken until the environment signalled ``done``
        or ``trunc``, or ``max_steps`` if the cap was reached first.
        """
        self.trewards = list()
        for e in range(episodes):
            self.env.reset()
            steps = 0
            for steps in range(1, max_steps + 1):
                a = self.env.action_space.sample()
                state, reward, done, trunc, info = self.env.step(a)
                # stop on termination *or* truncation of the episode
                if done or trunc:
                    break
            # record the episode even when the step cap ended it, so that
            # len(self.trewards) always equals the number of episodes played
            self.trewards.append(steps)

In [12]:
# Instantiate the random baseline agent (creates its own environment).
ra = RandomAgent()

In [13]:
import torch
# Show the installed PyTorch version.
torch.__version__

'2.7.0+cpu'

In [14]:
# Play 15 episodes with random actions.
ra.play(15)

In [15]:
# Number of steps recorded for each episode played.
ra.trewards

[30, 13, 35, 36, 24, 29, 16, 12, 17, 16, 23, 11, 36, 16, 9]

In [16]:
# Average recorded episode length, rounded to two decimals.
round(sum(ra.trewards) / len(ra.trewards), 2)

21.53

In [17]:
import os
import random
import warnings
import numpy as np
from torch import optim
import torch.nn as nn
from collections import deque

In [18]:
import os
import warnings

import torch  # make sure torch is imported

# Reproducibility settings
warnings.simplefilter('ignore')
os.environ['PYTHONHASHSEED'] = '0'

# The cuDNN flags are only configured when CUDA is available;
# otherwise a notice is printed and they are left untouched.
if not torch.cuda.is_available():
    print("CUDA n'est pas disponible, les paramètres cudnn ne sont pas appliqués")
else:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


CUDA n'est pas disponible, les paramètres cudnn ne sont pas appliqués


In [19]:
# Learning rate; read as a module-level global by DQLAgent.__init__ when it
# builds the Adam optimizer.
lr = 0.005

In [20]:
# Seed Python's, NumPy's and PyTorch's random number generators.
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)

<torch._C.Generator at 0x1fb84bcb6f0>

In [21]:
class DQLAgent:
    """Deep Q-learning agent for the CartPole-v1 environment.

    Bundles the exploration parameters, a bounded replay memory, the
    environment and a small fully connected network together with its
    optimizer and loss function.
    """
    def __init__(self):
        """Set hyperparameters, create the environment and build the network."""
        # exploration schedule: start value, lower bound, multiplicative decay
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9975
        # replay memory (oldest entries are dropped beyond 2000) and batch size
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        # factor applied to the next-state value in the learning target
        self.gamma = 0.9
        # per-episode results and best result seen so far
        self.trewards = list()
        self.max_treward = 0
        # environment and the sizes derived from its spaces
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        # network on the selected device, optimizer and loss
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.model = self._create_model().to(self.device)
        # `lr` is the module-level learning rate defined in the notebook
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
    def _create_model(self):
        """Return the network: two hidden ReLU layers of width 24 and a
        linear output layer with one unit per action."""
        hidden = 24
        layers = [
            nn.Linear(self.state_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, self.action_size),
        ]
        return nn.Sequential(*layers)

In [22]:
class DQLAgent(DQLAgent):
    """Extends the agent with action selection and experience replay."""
    def act(self, state):
        """Return an action for ``state`` using an epsilon-greedy rule.

        With probability ``self.epsilon`` a random action is sampled from
        the environment's action space; otherwise the action with the
        largest network output is returned as a Python int.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state = torch.FloatTensor(state).to(self.device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()
    def replay(self):
        """Run one optimisation step on a random batch from the replay memory.

        Does nothing while the memory holds fewer than ``self.batch_size``
        entries. After the update, ``self.epsilon`` is decayed as long as it
        is above ``self.epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)
        # Stack through NumPy first: building a tensor directly from a
        # sequence of ndarrays is the slow element-by-element path.
        states = torch.as_tensor(
            np.asarray(states, dtype=np.float32), device=self.device
        ).squeeze(1)
        next_states = torch.as_tensor(
            np.asarray(next_states, dtype=np.float32), device=self.device
        ).squeeze(1)
        actions = torch.as_tensor(
            np.asarray(actions, dtype=np.int64), device=self.device
        )
        rewards = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32), device=self.device
        )
        dones = torch.as_tensor(
            np.asarray(dones, dtype=np.float32), device=self.device
        )
        # network output for the action actually taken in each transition
        q_values = self.model(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        # The target is a constant w.r.t. the optimisation step, so it is
        # computed without autograd tracking (replaces the later detach()).
        with torch.no_grad():
            next_q_values = self.model(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for finished episodes
            expected_q_value = (
                rewards + self.gamma * next_q_values * (1 - dones)
            )
        loss = self.criterion(q_value, expected_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [23]:
class DQLAgent(DQLAgent):
    """Extends the agent with the training loop."""
    def learn(self, episodes):
        """Interact with the environment for ``episodes`` episodes.

        Every transition is appended to the replay memory. When an episode
        ends (``done`` or ``trunc``), its length is stored in
        ``self.trewards``, ``self.max_treward`` is updated and a progress
        line is printed. After each episode ``replay()`` is called once,
        provided the memory holds more than ``self.batch_size`` entries.
        """
        row_shape = [1, self.state_size]
        for episode in range(1, episodes + 1):
            observation, _info = self.env.reset()
            state = np.reshape(observation, row_shape)
            for step in range(1, 5000):
                action = self.act(state)
                observation, reward, done, trunc, _info = self.env.step(action)
                next_state = np.reshape(observation, row_shape)
                self.memory.append([state, action, next_state, reward, done])
                state = next_state
                if not (done or trunc):
                    continue
                # episode finished: book-keeping and progress output
                self.trewards.append(step)
                self.max_treward = max(self.max_treward, step)
                print(
                    f'episode={episode:4d} | treward={step:4d}'
                    f' | max={self.max_treward:4d}',
                    end='\r',
                )
                break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

In [24]:
class DQLAgent(DQLAgent):
    """Extends the agent with a greedy evaluation loop."""
    def test(self, episodes):
        """Play ``episodes`` episodes, always taking the action with the
        largest network output, and print the length of each episode."""
        row_shape = [1, self.state_size]
        for _ in range(episodes):
            observation, _info = self.env.reset()
            state = np.reshape(observation, row_shape)
            for step in range(1, 5001):
                network_input = torch.FloatTensor(state).to(self.device)
                with torch.no_grad():
                    q_values = self.model(network_input)
                action = torch.argmax(q_values).item()
                observation, reward, done, trunc, _info = self.env.step(action)
                state = np.reshape(observation, row_shape)
                if done or trunc:
                    print(step, end=' ')
                    break

In [25]:
# NOTE(review): the agent trained below is DQLAgent2 from the external
# dqlagent_pytorch2 module, not the DQLAgent class assembled in the cells
# above — confirm this is intended.
from dqlagent_pytorch2 import DQLAgent2

agent = DQLAgent2()

CUDA n'est pas disponible, les paramètres cudnn ne sont pas appliqués


In [26]:
# Call learn(2500) on the agent and report the elapsed time.
%time agent.learn(2500)

episode=2500 | treward= 185 | max= 500
CPU times: total: 45.8 s
Wall time: 58.4 s


In [27]:
# Inspect the agent's epsilon attribute after the learn() call.
agent.epsilon

0.09997053357470892

In [28]:
# Call test(15) on the agent; presumably 15 evaluation episodes, as in
# DQLAgent.test — verify against DQLAgent2.
agent.test(15)

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>