<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 03 &mdash; Financial Q-Learning**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Finance Environment

In [1]:
import os
import random

In [2]:
# Seed Python's built-in random number generator so that the random action
# samples drawn via the `random` module are reproducible.
random.seed(100)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter startup;
# setting it here presumably only affects processes started afterwards.
# Confirm whether that is the intent.
os.environ['PYTHONHASHSEED'] = '0'

In [3]:
class ActionSpace:
    """Binary action space consisting of the two actions 0 and 1."""

    def sample(self):
        """Return a randomly drawn action, either 0 or 1.

        The draw uses the module-level ``random`` generator, so the sequence
        of sampled actions is reproducible after ``random.seed``.
        """
        action = random.randint(0, 1)
        return action

In [4]:
action_space = ActionSpace()

In [5]:
[action_space.sample() for _ in range(10)]

[0, 1, 1, 0, 1, 1, 1, 0, 0, 0]

In [6]:
import numpy as np
import pandas as pd

In [7]:
class Finance:
    """Base part of the financial prediction environment.

    Stores the configuration, creates the action space and loads the raw
    data. ``_prepare_data`` is called by the constructor but is not defined
    in this class body; it has to be supplied by an extension of the class.
    """

    # Location of the CSV file holding the raw time series data.
    url = 'https://certificate.tpq.io/rl4finance.csv'

    def __init__(self, symbol, feature,
                 min_accuracy=0.485, n_features=4):
        """Set up the environment and load and prepare its data.

        Parameters
        ----------
        symbol:
            Column of the raw data set to work with.
        feature:
            Column of the prepared data used to build the state.
        min_accuracy:
            Accuracy threshold stored on the instance.
        n_features:
            Number of lagged observations per state.
        """
        self.symbol, self.feature = symbol, feature
        self.n_features = n_features
        self.min_accuracy = min_accuracy
        self.action_space = ActionSpace()
        # Data retrieval first, then the derived columns.
        self._get_data()
        self._prepare_data()

    def _get_data(self):
        """Read the CSV file at ``self.url`` into ``self.raw``.

        The first column becomes the index and is parsed as dates.
        """
        raw = pd.read_csv(self.url, index_col=0, parse_dates=True)
        self.raw = raw

In [8]:
class Finance(Finance):
    """Extends ``Finance`` with data preparation and episode reset."""

    def _prepare_data(self):
        """Build ``self.data`` and its normalized counterpart ``self.data_``.

        ``self.data`` holds the selected symbol, its log returns ``r`` and
        the direction label ``d`` (1 for a positive return, 0 otherwise).
        ``self.data_`` is the same frame with every column standardized by
        its own mean and standard deviation.
        """
        frame = pd.DataFrame(self.raw[self.symbol]).dropna()
        prices = frame[self.symbol]
        frame['r'] = np.log(prices / prices.shift(1))
        frame['d'] = np.where(frame['r'] > 0, 1, 0)
        # The first row has no previous price and hence no return.
        frame = frame.dropna()
        self.data = frame
        self.data_ = (frame - frame.mean()) / frame.std()

    def reset(self):
        """Start a new episode and return the initial state.

        Returns
        -------
        tuple
            The first ``n_features`` normalized values of the feature
            column and an empty info dictionary.
        """
        self.treward = 0
        self.bar = self.n_features
        start = self.bar - self.n_features
        window = self.data_[self.feature].iloc[start:self.bar]
        return window.values, {}

In [9]:
class Finance(Finance):
    """Extends ``Finance`` with the state transition logic."""

    def step(self, action):
        """Evaluate ``action`` against the current bar and advance by one.

        The reward is 1 if ``action`` equals the direction label ``d`` of
        the current bar and 0 otherwise.

        Returns
        -------
        tuple
            ``(next_state, reward, done, False, {})`` where ``next_state``
            holds the latest ``n_features`` normalized feature values.
        """
        hit = action == self.data['d'].iloc[self.bar]
        reward = 1 if hit else 0
        self.treward += reward
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.n_features)
        # The episode ends when the data is exhausted, or after a wrong
        # prediction that leaves the accuracy below the threshold once
        # more than 15 bars have been passed.
        exhausted = self.bar >= len(self.data)
        failed = (reward != 1
                  and self.accuracy < self.min_accuracy
                  and self.bar > 15)
        done = bool(exhausted or failed)
        start = self.bar - self.n_features
        next_state = self.data_[self.feature].iloc[start:self.bar].values
        return next_state, reward, done, False, {}

In [10]:
fin = Finance(symbol='EUR=', feature='EUR=')

In [11]:
list(fin.raw.columns)

['AAPL.O',
 'MSFT.O',
 'INTC.O',
 'AMZN.O',
 'GS.N',
 '.SPX',
 '.VIX',
 'SPY',
 'EUR=',
 'XAU=',
 'GDX',
 'GLD']

In [12]:
fin.reset()
# four lagged, normalized price points

(array([2.74844931, 2.64643904, 2.69560062, 2.68085214]), {})

In [13]:
fin.action_space.sample()

1

In [14]:
fin.step(fin.action_space.sample())

(array([2.64643904, 2.69560062, 2.68085214, 2.63046153]), 0, False, False, {})

In [15]:
fin = Finance('EUR=', 'r')

In [16]:
fin.reset()
# four lagged, normalized log returns

(array([-1.19130476, -1.21344494,  0.61099805, -0.16094865]), {})

In [17]:
class RandomAgent:
    """Agent that interacts with the environment through random actions."""

    def __init__(self):
        """Create the ``Finance`` environment for 'EUR=' with feature 'r'."""
        self.env = Finance('EUR=', 'r')

    def play(self, episodes=1):
        """Play ``episodes`` episodes with randomly sampled actions.

        For every episode that ends within 99 steps, the number of steps
        taken is appended to ``self.trewards``. Episodes that are still
        running after 99 steps are not recorded.
        """
        self.trewards = []
        for _ in range(episodes):
            self.env.reset()
            for n_steps in range(1, 100):
                choice = self.env.action_space.sample()
                _, _, finished, _, _ = self.env.step(choice)
                if not finished:
                    continue
                self.trewards.append(n_steps)
                break

In [18]:
ra = RandomAgent()

In [19]:
ra.play(15)

In [20]:
ra.trewards

[17, 13, 17, 12, 12, 12, 13, 23, 31, 13, 12, 15]

In [21]:
round(sum(ra.trewards) / len(ra.trewards), 2)

15.83

In [22]:
len(fin.data)

2607

## DQL Agent

In [23]:
import os
import random
import warnings
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

In [24]:
warnings.simplefilter('ignore')

In [25]:
# Learning rate; read by DQLAgent.__init__ when it creates the Adam optimizer.
lr = 0.0001

In [26]:
class DQLAgent:
    """Deep Q-learning agent for the ``Finance`` environment.

    The agent approximates the action values with a small fully connected
    network, explores epsilon-greedily, stores transitions in a bounded
    replay memory and trains on random mini-batches drawn from it.
    """

    def __init__(self, symbol, feature, min_accuracy, n_features=4):
        """Create the agent, its network, optimizer and environment.

        Parameters
        ----------
        symbol, feature, min_accuracy, n_features:
            Passed on unchanged to ``Finance``; ``n_features`` is also the
            input size of the network.

        The optimizer's learning rate is taken from the module-level
        variable ``lr``.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Exploration rate; multiplied by epsilon_decay after each replay
        # as long as it is above epsilon_min.
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        # Replay memory keeping at most the 2000 latest transitions.
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        # Discount factor applied to the best next-state action value.
        self.gamma = 0.5
        self.trewards = []
        self.max_treward = 0
        self.n_features = n_features

        # Define neural network
        class Net(nn.Module):
            def __init__(self, input_dim, output_dim):
                super().__init__()
                self.fc1 = nn.Linear(input_dim, 24)
                self.fc2 = nn.Linear(24, 24)
                self.out = nn.Linear(24, output_dim)

            def forward(self, x):
                x = torch.relu(self.fc1(x))
                x = torch.relu(self.fc2(x))
                return self.out(x)

        self.model = Net(n_features, 2).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.env = Finance(symbol, feature, min_accuracy, n_features)

    def _to_tensor(self, state):
        """Return ``state`` as a float tensor with a leading batch axis."""
        return torch.FloatTensor(state).unsqueeze(0).to(self.device)

    def _greedy_action(self, state):
        """Return the action with the highest predicted value for ``state``."""
        with torch.no_grad():
            q_values = self.model(self._to_tensor(state))
        return int(torch.argmax(q_values[0]).item())

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from
        the environment's action space; otherwise the greedy action is used.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self._greedy_action(state)

    def replay(self):
        """Train on one random mini-batch from memory and decay epsilon.

        One optimizer step is taken per sampled transition. ``random.sample``
        raises ``ValueError`` if the memory holds fewer than ``batch_size``
        transitions.
        """
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            state_tensor = self._to_tensor(state)
            next_state_tensor = self._to_tensor(next_state)
            # The target is computed without gradient tracking; terminal
            # transitions use the immediate reward only.
            with torch.no_grad():
                target_q = reward + (0 if done else self.gamma * torch.max(self.model(next_state_tensor)[0]).item())
            current_q = self.model(state_tensor)[0, action]
            loss = self.loss_fn(current_q, torch.tensor(target_q, dtype=torch.float, device=self.device))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        """Run ``episodes`` training episodes.

        Every transition is stored in memory. When an episode ends, its
        number of steps is appended to ``self.trewards`` and
        ``self.max_treward`` is updated. After each episode one replay is
        performed once the memory holds more than ``batch_size`` entries.
        """
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            for f in range(1, 5000):
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                self.memory.append((state, action, next_state, reward, done))
                state = next_state
                if done:
                    self.trewards.append(f)
                    self.max_treward = max(self.max_treward, f)
                    print(f'episode={e:4d} | treward={f:4d} | max={self.max_treward:4d}', end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

    def test(self, episodes):
        """Run ``episodes`` purely greedy episodes and print the results.

        The environment's ``min_accuracy`` is set to 0.5 for the duration of
        the test and restored afterwards, even if an episode raises. The
        value printed as total reward is the number of steps taken.
        """
        ma = self.env.min_accuracy
        self.env.min_accuracy = 0.5
        try:
            for e in range(1, episodes + 1):
                state, _ = self.env.reset()
                for f in range(1, 5001):
                    action = self._greedy_action(state)
                    state, reward, done, trunc, _ = self.env.step(action)
                    if done:
                        print(f'total reward={f} | accuracy={self.env.accuracy:.3f}')
                        break
        finally:
            # Always hand the environment back with its original threshold.
            self.env.min_accuracy = ma

In [27]:
# Seed the random number generators of Python, NumPy and PyTorch with the
# same value before the agent is created and trained.
random.seed(250)
np.random.seed(250)
torch.manual_seed(250)

<torch._C.Generator at 0x12cb4cb4b70>

In [28]:
agent = DQLAgent('EUR=', 'r', 0.495, 4)

In [29]:
%time agent.learn(250)

episode= 250 | treward=  12 | max=2603
CPU times: total: 10.2 s
Wall time: 10.4 s


In [30]:
agent.test(5)

total reward=2603 | accuracy=0.516
total reward=2603 | accuracy=0.516
total reward=2603 | accuracy=0.516
total reward=2603 | accuracy=0.516
total reward=2603 | accuracy=0.516


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>