<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 08 &mdash; Dynamic Asset Allocation (Three Asset Case)**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

In [1]:
%run assetallocation_pytorch.py

In [2]:
days = 2 * 252

In [3]:
import random
days = 2 * 252
random.seed(100)

In [4]:
import warnings
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
import os
import math
import pandas as pd
from scipy import stats
from pylab import plt, mpl
from scipy.optimize import minimize
import torch
from dqlagent_pytorch import *
import random



# Episode length in rows of the price data; passed as ``steps`` when the
# environment is instantiated further below (presumably two years of 252
# trading days each -- confirm).
days = 2 * 252
# Seed the standard-library RNG, which the environment uses to pick the
# random data window of every episode.
random.seed(100)

# Plotting defaults: seaborn look, high-resolution figures, serif font.
plt.style.use('seaborn-v0_8')
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.family'] = 'serif'
# Print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)


class observation_space:
    """Minimal stand-in for a Gym-style observation space.

    The only attribute is ``shape``, a one-element tuple holding the
    number of state features ``n``.
    """

    def __init__(self, n):
        self.shape = tuple([n])


class action_space:
    """Continuous action space of portfolio weights.

    An action is a vector of ``n`` non-negative weights that sum to one.
    """

    def __init__(self, n):
        self.n = n

    def seed(self, seed):
        """Seed the random number generators behind this space.

        ``sample`` draws from NumPy's global generator, so NumPy is seeded
        here in addition to the standard-library ``random`` module;
        otherwise seeding would have no effect on the sampled actions.
        """
        random.seed(seed)
        np.random.seed(seed)

    def sample(self):
        """Return ``n`` random non-negative weights normalised to sum to 1."""
        rn = np.random.random(self.n)
        return rn / rn.sum()


class Investing:
    """Three-asset investment environment with a Gym-like interface.

    Every episode is a randomly chosen window of ``steps`` consecutive
    rows of the price data, normalised so that each asset starts at 1.
    The state is ``[Xt, Yt, Zt, xt, yt, zt]``: the three normalised prices
    followed by the portfolio weights currently held. An action is the
    vector of the three new portfolio weights.

    Parameters
    ----------
    asset_one, asset_two, asset_three:
        Column names of the three assets in the downloaded data set.
    steps:
        Number of rows per episode window.
    amount:
        Initial portfolio value.
    """

    def __init__(self, asset_one, asset_two, asset_three, steps=252, amount=1):
        self.asset_one = asset_one
        self.asset_two = asset_two
        self.asset_three = asset_three
        self.steps = steps
        self.initial_balance = amount
        self.portfolio_value = amount
        self.portfolio_value_new = amount
        # The state has six entries (three prices plus three weights, see
        # ``_get_state``), so the observation space must report six.
        self.observation_space = observation_space(6)
        self.osn = self.observation_space.shape[0]
        self.action_space = action_space(3)
        self.retrieved = False
        self._generate_data()
        self.portfolios = pd.DataFrame()
        self.episode = 0

    def _generate_data(self):
        """Select and normalise a random window of ``steps`` rows.

        The raw data is downloaded only once; subsequent calls reuse
        ``self.raw`` and merely draw a new window.
        """
        if not self.retrieved:
            url = 'https://certificate.tpq.io/rl4finance.csv'
            self.raw = pd.read_csv(url, index_col=0, parse_dates=True).dropna()
            # Remember the download so that ``reset`` does not repeat it.
            self.retrieved = True
        self.data = pd.DataFrame()
        self.data['X'] = self.raw[self.asset_one]
        self.data['Y'] = self.raw[self.asset_two]
        self.data['Z'] = self.raw[self.asset_three]
        # ``s`` is the (exclusive) end row of the window.
        s = random.randint(self.steps, len(self.data))
        self.data = self.data.iloc[s-self.steps:s]
        self.data = self.data / self.data.iloc[0]

    def _get_state(self):
        """Return the state at the current bar and an info dict with its date."""
        Xt = self.data['X'].iloc[self.bar]
        Yt = self.data['Y'].iloc[self.bar]
        Zt = self.data['Z'].iloc[self.bar]
        date = self.data.index[self.bar]
        return np.array(
            [Xt, Yt, Zt, self.xt, self.yt, self.zt]
            ), {'date': date}

    def seed(self, seed=None):
        """Seed the standard-library RNG used for the window selection."""
        if seed is not None:
            random.seed(seed)

    def reset(self):
        """Start a new episode and return ``(state, info)``."""
        self.xt = 0
        self.yt = 0
        self.zt = 0
        self.bar = 0
        self.treward = 0
        self.portfolio_value = self.initial_balance
        self.portfolio_value_new = self.initial_balance
        self.episode += 1
        self._generate_data()
        self.state, info = self._get_state()
        return self.state, info

    def add_results(self, pl):
        """Append one row describing the current step to ``self.portfolios``.

        ``pl`` is the absolute profit or loss of the step.
        """
        df = pd.DataFrame({
                   'e': self.episode, 'date': self.date,
                   'xt': self.xt, 'yt': self.yt, 'zt': self.zt,
                   'pv': self.portfolio_value,
                   'pv_new': self.portfolio_value_new, 'p&l[$]': pl,
                   'p&l[%]': pl / self.portfolio_value_new * 100,
                   'Xt': self.state[0], 'Yt': self.state[1],
                   'Zt': self.state[2], 'Xt_new': self.new_state[0],
                   'Yt_new': self.new_state[1],
                   'Zt_new': self.new_state[2],
                          }, index=[0])
        self.portfolios = pd.concat((self.portfolios, df), ignore_index=True)

    def step(self, action):
        """Advance one bar, applying the weights in ``action``.

        Returns ``(state, reward, done, False, {})``. The first step of an
        episode only sets the initial allocation and yields a reward of 0.
        Afterwards the reward is the annualised last percentage P&L
        divided by the annualised rolling (20 rows) standard deviation of
        the recorded percentage P&Ls; it falls back to 0 when that
        volatility is zero or undefined.
        """
        self.bar += 1
        self.new_state, info = self._get_state()
        self.date = info['date']
        if self.bar == 1:
            self.xt = action[0]
            self.yt = action[1]
            self.zt = action[2]
            pl = 0.
            reward = 0.
            self.add_results(pl)
        else:
            # Value of the previous allocation at the new prices.
            self.portfolio_value_new = (
                self.xt * self.portfolio_value *
                    self.new_state[0] / self.state[0] +
                self.yt * self.portfolio_value *
                    self.new_state[1] / self.state[1] +
                self.zt * self.portfolio_value *
                    self.new_state[2] / self.state[2]
            )
            pl = self.portfolio_value_new - self.portfolio_value
            self.xt = action[0]
            self.yt = action[1]
            self.zt = action[2]
            self.add_results(pl)
            ret = self.portfolios['p&l[%]'].iloc[-1] / 100 * 252
            vol = self.portfolios['p&l[%]'].rolling(
                20, min_periods=1).std().iloc[-1] * math.sqrt(252)
            # A zero or NaN volatility would produce a NaN/inf reward and
            # thereby corrupt the training targets, so use 0 instead.
            if math.isfinite(vol) and vol > 0:
                reward = ret / vol
            else:
                reward = 0.
            self.portfolio_value = self.portfolio_value_new
        done = self.bar == len(self.data) - 1
        self.state = self.new_state
        return self.state, reward, done, False, {}


class InvestingAgent(DQLAgent):
    """DQL agent for the continuous three-asset allocation problem.

    The Q-network maps a state whose last three entries are portfolio
    weights to a single scalar value. The best action is found by
    numerically maximising that value over the weights.
    """

    def __init__(self, symbol, feature, n_features, env, hu=24, lr=0.001):
        super().__init__(symbol, feature, n_features, env, hu, lr)
        # Continuous action: override model to output scalar Q-value
        self.model = QNetwork(self.n_features, 1, hu).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    def opt_action(self, state):
        """Return the three weights that maximise the penalised Q-value.

        The weights are bounded to ``[0, 1]`` and constrained to sum to
        one. If the optimisation raises, a random action sampled from the
        environment's action space is returned instead.
        """
        bnds = 3 * [(0, 1)]  # three weights
        cons = [{'type': 'eq', 'fun': lambda x: x.sum() - 1}]

        def f_obj(x):
            # Evaluate the network with the candidate weights written into
            # the weight slots (columns 3-5) of the state.
            s = state.copy()
            s[0, 3] = x[0]
            s[0, 4] = x[1]
            s[0, 5] = x[2]
            # Penalise deviations from the weights currently in the state.
            pen = np.mean((state[0, 3:] - x) ** 2)
            s_tensor = torch.FloatTensor(s).to(device)
            with torch.no_grad():
                q_val = self.model(s_tensor)
            return q_val.cpu().numpy()[0, 0] - pen

        try:
            state = self._reshape(state)
            # ``minimize`` minimises, hence the negated objective.
            res = minimize(lambda x: -f_obj(x), 3 * [1 / 3],
                           bounds=bnds, constraints=cons,
                           options={'eps': 1e-4}, method='SLSQP')
            action = res['x']
        except Exception:
            # Deliberate best-effort fallback: explore randomly.
            action = self.env.action_space.sample()
        return action

    def act(self, state):
        """Choose a random action with probability epsilon, else the optimum."""
        if random.random() <= self.epsilon:
            return self.env.action_space.sample()
        return self.opt_action(state)

    def replay(self):
        """Train the network on a random batch of stored transitions.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Epsilon is decayed once per call.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            # 0-dim target so that its shape matches ``current_q`` below
            # and the loss does not depend on implicit broadcasting.
            target = torch.tensor(reward, dtype=torch.float32,
                                  device=device)
            if not done:
                ns = next_state.copy()
                action_cont = self.opt_action(ns)
                ns[0, 3:] = action_cont
                ns_tensor = torch.FloatTensor(ns).to(device)
                with torch.no_grad():
                    future_q = self.model(ns_tensor)[0, 0]
                target = target + self.gamma * future_q
            state_tensor = torch.FloatTensor(state).to(device)
            self.optimizer.zero_grad()
            current_q = self.model(state_tensor)[0, 0]
            loss = self.criterion(current_q, target)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def test(self, episodes, verbose=True):
        """Run ``episodes`` episodes using only optimised (greedy) actions.

        With ``verbose`` the total reward of each finished episode is
        printed on a single, continuously overwritten line.
        """
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = self._reshape(state)
            treward = 0
            for _step in range(1, len(self.env.data) + 1):
                action = self.opt_action(state)
                state, reward, done, trunc, _ = self.env.step(action)
                state = self._reshape(state)
                treward += reward
                if done:
                    templ = f'episode={e} | total reward={treward:4.2f}'
                    if verbose:
                        print(templ, end='\r')
                    break
        print()






# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = '0'

# Run the networks on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class QNetwork(nn.Module):
    """Feed-forward Q-network with two ReLU hidden layers.

    Maps ``state_dim`` input features through two hidden layers of ``hu``
    units each to ``action_dim`` linear outputs.
    """

    def __init__(self, state_dim, action_dim, hu=24):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hu)
        self.fc2 = nn.Linear(hu, hu)
        self.fc3 = nn.Linear(hu, action_dim)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = F.relu(hidden)
        return self.fc3(hidden)

class DQLAgent:
    """Deep Q-learning agent for environments with discrete actions.

    Parameters
    ----------
    symbol, feature:
        Stored on the instance; not used by the methods of this class.
    n_features:
        Number of state features, i.e. the input size of the Q-network.
    env:
        Environment exposing ``reset``, ``step`` and ``action_space``, or
        an ``int`` giving just the number of discrete actions.
    hu:
        Number of units per hidden layer.
    lr:
        Learning rate of the Adam optimizer.
    """

    def __init__(self, symbol, feature, n_features, env, hu=24, lr=0.001):
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        self.gamma = 0.5
        self.trewards = []
        self.max_treward = -np.inf
        self.symbol = symbol
        self.feature = feature
        self.n_features = n_features
        self.hu = hu
        self.lr = lr
        self.episodes = 0
        # If env is an integer it is the number of actions; wrap it in a
        # minimal environment *before* the network is built, because the
        # network size is read from ``self.env.action_space.n``.
        if isinstance(env, int):
            env = self._discrete_env(env)
        self.env = env
        # Q-Network and optimizer (built exactly once)
        self.model = QNetwork(self.n_features, self.env.action_space.n, hu).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    @staticmethod
    def _discrete_env(n_actions):
        """Return a minimal environment offering only an action space."""

        class _ActionSpace:
            def __init__(self, n):
                self.n = n

            def sample(self):
                # Uniform random action index, needed by ``act``.
                return random.randrange(self.n)

        class SimpleEnv:
            def __init__(self, n):
                self.action_space = _ActionSpace(n)

        return SimpleEnv(n_actions)

    def _reshape(self, state):
        """Return ``state`` flattened into an array of shape ``(1, n)``."""
        state = state.flatten()
        return np.reshape(state, [1, len(state)])

    def act(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state_tensor = torch.FloatTensor(state).to(device)
        if state_tensor.dim() == 1:
            state_tensor = state_tensor.unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return int(torch.argmax(q_values[0]).item())

    def replay(self):
        """Perform one batched Q-learning update from the replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Epsilon is decayed once per call.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states = np.vstack([e[0] for e in batch])
        actions = np.array([e[1] for e in batch])
        next_states = np.vstack([e[2] for e in batch])
        rewards = np.array([e[3] for e in batch], dtype=np.float32)
        dones = np.array([e[4] for e in batch], dtype=bool)

        states_tensor = torch.FloatTensor(states).to(device)
        next_states_tensor = torch.FloatTensor(next_states).to(device)
        actions_tensor = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards_tensor = torch.FloatTensor(rewards).to(device)
        dones_tensor = torch.BoolTensor(dones).to(device)

        current_q = self.model(states_tensor).gather(1, actions_tensor).squeeze(1)
        # The target is treated as a constant, so no graph is needed here.
        with torch.no_grad():
            next_q = self.model(next_states_tensor).max(1)[0]
            target_q = rewards_tensor + self.gamma * next_q * (~dones_tensor).float()

        loss = self.criterion(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        """Interact with the environment for ``episodes`` episodes.

        Transitions are stored in the replay memory and ``replay`` is
        called once after every episode when enough transitions exist.
        """
        for e in range(1, episodes + 1):
            self.episodes += 1
            state, _ = self.env.reset()
            state = self._reshape(state)
            treward = 0
            for f in range(1, 5000):
                self.f = f
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                treward += reward
                next_state = self._reshape(next_state)
                self.memory.append((state, action, next_state, reward, done))
                state = next_state
                if done:
                    self.trewards.append(treward)
                    self.max_treward = max(self.max_treward, treward)
                    templ = f'episode={self.episodes:4d} | '
                    templ += f'treward={treward:7.3f} | max={self.max_treward:7.3f}'
                    print(templ, end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
            print()

    def test(self, episodes, min_accuracy=0.0, min_performance=0.0, verbose=True, full=True):
        """Run ``episodes`` test episodes and report the results.

        ``min_accuracy`` and ``min_performance`` temporarily replace the
        environment's attributes of the same name (when present) and are
        restored afterwards, even if an episode raises. Accuracy and
        performance are only reported for environments that expose them.
        NOTE(review): actions come from ``act`` and are therefore still
        epsilon-greedy during testing -- confirm this is intended.
        """
        # Backup and set environment thresholds
        ma = getattr(self.env, 'min_accuracy', None)
        if hasattr(self.env, 'min_accuracy'):
            self.env.min_accuracy = min_accuracy
        mp = None
        if hasattr(self.env, 'min_performance'):
            mp = self.env.min_performance
            self.env.min_performance = min_performance
            self.performances = []
        try:
            for e in range(1, episodes + 1):
                state, _ = self.env.reset()
                state = self._reshape(state)
                for f in range(1, 5001):
                    action = self.act(state)
                    state, reward, done, trunc, _ = self.env.step(action)
                    state = self._reshape(state)
                    if done:
                        templ = f'total reward={f:4d}'
                        if hasattr(self.env, 'accuracy'):
                            templ += f' | accuracy={self.env.accuracy:.3f}'
                        if hasattr(self.env, 'min_performance'):
                            self.performances.append(self.env.performance)
                            templ += f' | performance={self.env.performance:.3f}'
                        if verbose:
                            if full:
                                print(templ)
                            else:
                                print(templ, end='\r')
                        break
        finally:
            # Restore environment thresholds
            if hasattr(self.env, 'min_accuracy') and ma is not None:
                self.env.min_accuracy = ma
            if mp is not None:
                self.env.min_performance = mp
        print()



# Asset mapping inside the environment: first asset -> X, second -> Y,
# third -> Z. Each episode covers ``days`` rows of the data set.
investing = Investing('.SPX', '.VIX', 'XAU=', steps=days)
#investing.data.plot(lw=1, style=['g--', 'b:', 'm-.'])
#plt.ylabel('price');

In [5]:
# Seed every random number generator in use (standard library, NumPy and
# PyTorch on CPU and all CUDA devices) before the agent is created.
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)
torch.cuda.manual_seed_all(100)

In [6]:
agent = InvestingAgent('3AC', feature=None, n_features=6, env=investing, hu=128, lr=0.00025)

In [7]:
episodes = 10

In [8]:
%time agent.learn(episodes)

episode=   1 | treward=  2.294 | max=  2.294
episode=   2 | treward=  2.723 | max=  2.723
episode=   3 | treward=  1.509 | max=  2.723
episode=   4 | treward=  5.431 | max=  5.431
episode=   5 | treward= -3.476 | max=  5.431
episode=   6 | treward=  4.838 | max=  5.431
episode=   7 | treward= -0.033 | max=  5.431
episode=   8 | treward=  3.942 | max=  5.431
episode=   9 | treward=  2.227 | max=  5.431
episode=  10 | treward=  3.377 | max=  5.431
CPU times: total: 5.02 s
Wall time: 7.31 s


In [9]:
agent.epsilon

0.9752793831785673

In [10]:
agent.env.portfolios = pd.DataFrame()

In [11]:
%time agent.test(10)

episode=10 | total reward=-1.42
CPU times: total: 17.5 s
Wall time: 19.9 s


In [12]:
agent.env.portfolios.groupby('e')[
    ['xt', 'yt', 'zt']].mean().mean()

xt    0.947857
yt    0.016547
zt    0.035595
dtype: float64

In [13]:
agent.env.portfolios.groupby('e')[
    ['Xt', 'Yt', 'Zt', 'pv']].last().mean()

Xt    1.151945
Yt    1.070904
Zt    1.118942
pv    1.232462
dtype: float64

In [14]:
def get_r(n):
    """Return the recorded portfolio rows of episode ``n``, indexed by date."""
    records = agent.env.portfolios
    in_episode = records['e'] == n
    return records[in_episode].set_index('date')

In [15]:
n = min(agent.env.portfolios['e']) + 1
n

12

In [16]:
r = get_r(n)

In [17]:
r[['xt', 'yt', 'zt']].mean()

xt    0.947162
yt    0.008528
zt    0.044310
dtype: float64

In [18]:
r[['xt', 'yt', 'zt']].std()

xt    0.132251
yt    0.044254
zt    0.106386
dtype: float64

In [19]:
#r[['xt', 'yt', 'zt']].plot(title='ALLOCATIONS [%]', style=['g--', 'b:', 'm-.'], lw=1, grid=True).plt.ylabel('allocation')

In [20]:
cols = ['Xt', 'Yt', 'Zt', 'pv']

In [21]:
sub = r[cols]

In [22]:
rets = sub.iloc[-1] / sub.iloc[0] - 1
rets

Xt    0.363015
Yt    0.837442
Zt    0.177179
pv    0.531629
dtype: float64

In [23]:
stds = sub.pct_change().std() * math.sqrt(252)
stds

Xt    0.262097
Yt    1.540253
Zt    0.166215
pv    0.231560
dtype: float64

In [24]:
rets / stds

Xt    1.385041
Yt    0.543704
Zt    1.065963
pv    2.295856
dtype: float64

In [25]:
#sub.plot(style=['g--', 'b:', 'm-.', 'r-'], lw=1)
#plt.ylabel('value');

In [26]:
sharpe = pd.DataFrame()

In [27]:
def calculate_sr():
    """Fill the global ``sharpe`` frame with one column per recorded episode.

    For every episode found in ``investing.portfolios`` the total return
    of each column in ``cols`` is divided by its annualised standard
    deviation of percentage changes. Episodes are processed in ascending
    order so that the column order of ``sharpe`` is deterministic (a
    plain ``set`` has no guaranteed iteration order).
    """
    for n in sorted(set(investing.portfolios['e'])):
        r = get_r(n)
        sub = r[cols]
        rets = sub.iloc[-1] / sub.iloc[0] - 1
        stds = sub.pct_change().std() * math.sqrt(252)
        sharpe[n] = rets / stds

In [28]:
calculate_sr()

In [29]:
sharpe.round(2)

Unnamed: 0,11,12,13,14,15,16,17,18,19,20
Xt,1.19,1.39,2.26,0.33,2.75,-0.11,0.62,1.06,-0.09,0.37
Yt,0.14,0.54,-0.29,0.21,0.03,-0.43,0.04,0.1,-0.2,0.12
Zt,1.64,1.07,0.79,-0.56,-0.7,1.02,0.13,3.61,0.58,0.61
pv,0.99,2.3,2.63,2.49,5.36,-0.25,0.71,2.0,-0.31,0.15


In [30]:
sharpe.mean(axis=1)

Xt    0.976497
Yt    0.026487
Zt    0.817586
pv    1.606219
dtype: float64

In [31]:
((sharpe.loc['pv'] > sharpe.loc['Xt']) &
 (sharpe.loc['pv'] > sharpe.loc['Yt']) &
 (sharpe.loc['pv'] > sharpe.loc['Zt'])).value_counts()

False    5
True     5
Name: count, dtype: int64

## Equally Weighted Portfolio

In [32]:
agent.opt_action = lambda state: np.ones(3) / 3

In [33]:
agent.env.portfolios = pd.DataFrame()

In [34]:
%time agent.test(10)

episode=10 | total reward=1.45
CPU times: total: 3.25 s
Wall time: 5.4 s


In [35]:
sharpe = pd.DataFrame()

In [36]:
calculate_sr()

In [37]:
sharpe.round(2)

Unnamed: 0,21,22,23,24,25,26,27,28,29,30
Xt,1.91,0.42,2.56,-0.05,1.68,2.21,1.65,2.03,3.43,0.67
Yt,0.02,0.04,0.34,-0.53,0.02,-0.34,0.26,0.25,-0.03,0.04
Zt,1.57,-0.25,0.69,0.88,0.35,1.27,2.33,1.54,-0.98,-0.38
pv,1.96,1.35,1.91,0.39,1.41,0.91,2.98,2.88,1.05,1.38


In [38]:
sharpe.mean(axis=1)

Xt    1.651726
Yt    0.006942
Zt    0.702165
pv    1.620502
dtype: float64

In [39]:
((sharpe.loc['pv'] > sharpe.loc['Xt']) &
 (sharpe.loc['pv'] > sharpe.loc['Yt']) &
 (sharpe.loc['pv'] > sharpe.loc['Zt'])).value_counts()

True     5
False    5
Name: count, dtype: int64

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>