In [2]:
%load_ext autoreload
%autoreload 2

#standard imports
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import sys
sys.path.append("..")

#rl book imports
import rl
from rl.markov_decision_process import MarkovDecisionProcess
from rl.markov_process import State, MarkovProcess, NonTerminal, Terminal

from typing import (Callable, Dict, Generic, Iterator, Iterable, List,
                    Mapping, Optional, Sequence, Tuple, TypeVar, overload)

from rl.distribution import Categorical, Distribution, Constant, Choose
from rl.policy import Policy
from rl.monte_carlo import epsilon_greedy_policy, greedy_policy_from_qvf, glie_mc_control
from rl.function_approx import LinearFunctionApprox, AdamGradient
from rl.td import glie_sarsa, q_learning


#custom imports 
import utils as u
import data as dat
import mdp_agent as ag
import baseline_policies as bp
import q_plots as qp
import backtest as btest

## 1. Finite Horizon Trading MDP for mean reverting residuals. 

The goal of this part is to see whether an RL agent can learn an optimal trading rule on a simple mean-reverting residual process.

In [3]:
# Parameters forwarded to the simulator and reused further down to place the
# baseline thresholds. Presumably mu is the level the residual reverts to and
# sigma its dispersion -- TODO confirm against dat.build_simulated_train_test.
mu = 100
sigma = 30
# Build the simulated train / test data sets (N=10 is passed through as-is).
train, test = dat.build_simulated_train_test(N=10, mu=mu, sigma=sigma)
# Plot the training data.
u.plot_plotly_multiple(train)

Some of the algorithms available are:
- `glie_mc_control` (p. 352)
- `glie_sarsa` (p. 358)
- `q_learning`, which also needs `policy_from_q: PolicyFromQType` (p. 365)
- `q_learning_experience_replay`, which also needs `policy_from_q: PolicyFromQType` (p. 393)
- `least_squares_policy_iteration`, which takes the arguments `initial_target_policy: DeterministicPolicy[S, A]` and `transitions: Iterable[TransitionStep[S, A]]`

#### Baseline trading policies

In [4]:
# Baseline to compare the learned policy against: a fixed-threshold rule whose
# entry levels sit 0.2 * sigma away from mu and whose exit levels sit
# 0.05 * sigma away from mu (below mu for longs, above mu for shorts).
trader = ag.Trading(train, test)
threshold_policy = bp.ThresholdTradingPolicy(
    enter_long=mu - 0.2 * sigma,
    exit_long=mu - 0.05 * sigma,
    enter_short=mu + 0.2 * sigma,
    exit_short=mu + 0.05 * sigma,
)
# Backtest the baseline and display its summary as the cell output.
bt = btest.Backtester(trader, threshold_policy)
bt.summary()


Argument `closed` is deprecated in favor of `inclusive`.



Define the `PolicyFromQType` callable that `q_learning` needs

In [99]:
def pol_from_q(q, mdp, epsilon: float = 0.5):
    """Build the epsilon-greedy behaviour policy used while learning.

    This is the ``policy_from_q`` callable handed to ``q_learning``; it can
    still be called with just ``(q, mdp)``, in which case the exploration rate
    keeps its previous value of 0.5.

    Args:
        q: Current Q-value function approximation.
        mdp: The Markov decision process the policy acts in.
        epsilon: Exploration rate forwarded to ``epsilon_greedy_policy`` as
            ``ε``. Must lie in ``[0, 1]``.

    Returns:
        Whatever ``epsilon_greedy_policy(q, mdp, ε=epsilon)`` returns.

    Raises:
        ValueError: If ``epsilon`` is outside ``[0, 1]``.
    """
    # Fail early with a clear message: an out-of-range rate is not a
    # probability and would only surface later as an obscure sampling problem.
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError(f"epsilon must be in [0, 1], got {epsilon!r}")
    return epsilon_greedy_policy(q, mdp, ε=epsilon)

Train

In [106]:
# Trading environment built from the simulated train / test data sets.
mdp_trading = ag.Trading(train, test)

# Starting point for learning: the initial Q-value approximation and the
# start states requested for the "train" split.
approx_0 = mdp_trading.build_q_approx()
states = mdp_trading.generate_start_state("train")

# The result is iterated in the next cell, one Q-value approximation per item.
# The last two positional arguments are presumably the discount factor and the
# maximum episode length -- confirm against the rl.td.q_learning signature.
qvfs_q_learn = q_learning(
    mdp_trading,
    pol_from_q,
    states,
    approx_0,
    0.9,
    1000,
)


Get the final Q-value function

In [107]:
# Upper bound on the number of Q-value approximations pulled from the learner.
num_iter = 1_000_000
# Only the most recent approximation is kept; it stays None if the learner
# yields nothing at all.
final_qvf = None
# zip() asks range() for its next item first, so the loop ends after exactly
# num_iter items. The previous enumerate/break form pulled item num_iter + 1
# from the learner before breaking, i.e. it computed one extra update only to
# throw it away.
for _step, qvf in zip(range(num_iter), qvfs_q_learn):
    final_qvf = qvf

Visualize Q

In [109]:
# Inspect the learned Q-value function against the test data.
qanalysis = qp.QAnalyzer(test, final_qvf)
qanalysis.plot_snapshot()

Backtest

In [110]:
# Greedy policy w.r.t. the learned Q-values; the same three candidate actions
# (-1, 0, 1) are offered in every state.
trading_policy = greedy_policy_from_qvf(final_qvf, lambda x: [-1, 0, 1])
# Backtest the learned policy and display its summary as the cell output.
# Note: this rebinds `bt`, replacing the baseline backtester created earlier.
bt = btest.Backtester(mdp_trading, trading_policy)
bt.summary()

In [111]:
# Plot the test data that the backtest above was run with.
u.plot_plotly(test)