In [61]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import utils_project_mehdi as u
import numpy as np
import pandas as pd
import sys
sys.path.append("..")
import rl
from rl.markov_decision_process import MarkovDecisionProcess
from rl.markov_process import State, MarkovProcess, NonTerminal, Terminal

from typing import (Callable, Dict, Generic, Iterator, Iterable, List,
                    Mapping, Optional, Sequence, Tuple, TypeVar, overload)

from rl.distribution import Categorical, Distribution, Constant, Choose
from rl.policy import Policy
from rl.monte_carlo import epsilon_greedy_policy, greedy_policy_from_qvf, glie_mc_control
from rl.function_approx import LinearFunctionApprox, AdamGradient

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Finite Horizon Trading MDP for mean reverting residuals. 

The goal of this part is to see whether an RL agent can learn an optimal trading rule on a simple mean-reverting residual process.

In [62]:
def generate_ou_process(sigma, mu, kappa, start_date, end_date, S0=100, rng=None):
    """
    Simulates the level of an Ornstein-Uhlenbeck process on business days.

    Each step applies the Euler update
        S[t] = S[t-1] + kappa * (mu - S[t-1]) * dt + sigma * dW,   dW ~ N(0, dt)
    with dt = 1/252.

    Parameters:
    - sigma: Volatility of the process.
    - mu: Long-term mean level to which the process reverts.
    - kappa: Rate of reversion to the mean.
    - start_date: Start date of the simulation as a string (YYYY-MM-DD).
    - end_date: End date of the simulation as a string (YYYY-MM-DD).
    - S0: Initial value of the process, default is 100.
    - rng: Optional `numpy.random.Generator` used to draw the Gaussian increments,
      which makes a path reproducible. When None (default) the global
      `np.random` state is used.

    Returns:
    - DataFrame indexed by the business dates in [start_date, end_date] with a
      single column 'Value' holding the simulated level (levels, not returns).
      The frame is empty when the range contains no business day.
    """
    dates = pd.date_range(start=start_date, end=end_date, freq='B')  # 'B' for business days
    n = len(dates)
    prices = np.zeros(n)

    # Nothing to simulate: writing prices[0] on an empty array would raise IndexError
    if n == 0:
        return pd.DataFrame({'Value': prices}, index=dates)

    prices[0] = S0
    dt = 1/252  # assuming 252 trading days in a year
    noise_scale = np.sqrt(dt)  # standard deviation of one Wiener increment
    draw_normal = np.random.normal if rng is None else rng.normal

    for t in range(1, n):
        dW = draw_normal(0, noise_scale)  # increment of Wiener process
        prices[t] = prices[t-1] + kappa * (mu - prices[t-1]) * dt + sigma * dW

    return pd.DataFrame({
        'Value': prices
    }, index=dates)

def build_simulated_train_test(start='2019-01-01', end='2023-12-31', N = 100):
    """
    Simulates N training paths and one extra test path from the same OU process.

    Every path is produced by `generate_ou_process` with sigma=0.1, mu=100 and
    kappa=7 over the business days between `start` and `end`.

    Returns:
    - (train, test): a list of N DataFrames and a single DataFrame.
    """
    ou_settings = dict(sigma=0.1, mu=100, kappa=7, start_date=start, end_date=end)

    # training paths are drawn first, the test path last
    train_paths = [generate_ou_process(**ou_settings) for _ in range(N)]
    test_path = generate_ou_process(**ou_settings)

    return train_paths, test_path

# Simulate 100 training paths plus one separate path that is kept aside as the test set
train, test = build_simulated_train_test(N=100)

In [63]:
# Hand the list of training paths to the project plotting helper
# (presumably draws them in one Plotly figure — confirm in utils_project_mehdi)
u.plot_plotly_multiple(train)

We need to create :
- ` mdp: MarkovDecisionProcess[S, A]`
- `states: NTStateDistribution[S]`
- `approx_0: QValueFunctionApprox[S, A]`

These are the inputs required by the control/prediction algorithms in the codebase.
The available algorithms include:
- `glie_mc_control` (p. 352)
- `glie_sarsa` (p. 358)
- `q_learning`, which also needs `policy_from_q: PolicyFromQType` (p. 365)
- `q_learning_experience_replay`, which also needs `policy_from_q: PolicyFromQType` (p. 393)
- `least_squares_policy_iteration`, which takes `initial_target_policy: DeterministicPolicy[S, A]` and `transitions: Iterable[TransitionStep[S, A]]` as arguments


In [64]:
class ThresholdTradingPolicy(Policy[Dict,int]):
    """
    Deterministic policy that buys/sells when the spot crosses fixed thresholds.

    The decision depends only on the current spot and the current position:
    - flat  (position 0):  buy when spot <= enter_long, sell when spot >= enter_short
    - short (position -1): buy back when spot <= exit_short
    - long  (position +1): sell when spot >= exit_long
    In every other case the action is 0 (hold).
    """

    def __init__(self, enter_long, exit_long, enter_short, exit_short):
        """
        Setting the thresholds for trading decisions

        - enter_long: spot level at or below which a flat book goes long
        - exit_long: spot level at or above which a long position is sold
        - enter_short: spot level at or above which a flat book goes short
        - exit_short: spot level at or below which a short position is bought back
        """
        self.enter_long = enter_long
        self.enter_short = enter_short
        self.exit_long = exit_long
        self.exit_short = exit_short

    def act(self, state: NonTerminal[Dict])->Distribution[int]:
        """
        Returns the action for `state` as a Constant distribution over {-1, 0, 1}.

        Reads state.state["Spot"] and state.state["position"].
        """
        St = state.state["Spot"] #current spot
        pos = state.state["position"] #is +1  -1 or 0

        action = 0 #hold unless a threshold is crossed

        if pos == 0:
            # If both entry conditions hold (overlapping thresholds), going long wins
            if St <= self.enter_long:
                action = 1 #enter long
            elif St >= self.enter_short:
                action = -1 #enter short
        elif pos == -1:
            if St <= self.exit_short:
                action = 1  #buy back to exit short
        elif pos == 1:
            # The exit level of a long is exit_long (not the entry level)
            if St >= self.exit_long:
                action = -1 #sell to exit long

        return Constant(action)

In [65]:
def generate_initial_state_from_data(df):
    """
    Builds the initial (non-terminal) state dictionary from a price dataframe.

    Parameters:
    - df: DataFrame whose first column holds the price path and whose index
      holds the time steps.

    Returns:
    - NonTerminal wrapping a dict with the first price ("Spot"), a flat
      position (0), the first index value ("date") and the dataframe ("data").
    """
    # Explicit positional lookup of the first row / first column;
    # df.iloc[0][0] relies on the deprecated integer fallback of Series[...]
    S0 = df.iloc[0, 0]
    t = df.index[0]
    pos = 0  # every episode starts without a position
    return NonTerminal(
        {
            "Spot" : S0,
            "position" : pos,
            "date" : t,
            "data" : df
        }
    )

In [66]:
class Trading(MarkovDecisionProcess[Dict,int]):
    """
    Trading MDP over pre-computed price paths.

    - train is a list of dataframes representing price processes we want to trade
    - test is a dataframe in which we want to evaluate the policy
    - a state is a dictionary
        {
            "Spot" : price S_t
            "position" : short / flat / long (-1, 0 or 1)
            "date" : current time step
            "data" : dataframe with price process
        }
    - actions : sell (-1), hold (0) or buy (+1)
    """

    def __init__(self,train,test):
        self.train = train
        self.test = test

    def actions(self, state):
        """Returns the same action set for every state: sell, hold, buy."""
        return [-1,0,1] #sell hold buy

    def step(self, state, action)->Distribution[Tuple[State[Dict],float]]:
        """
        Moves one time step forward along the price path stored in the state.

        The reward is the simple return of the spot over the step, signed by the
        position held when the step starts. The action only changes the position
        carried into the next state, which is clipped to -1/0/+1 with np.sign.

        Returns:
        - Constant distribution on (next_state, reward); next_state is Terminal
          when u.get_next flags the new date as the last one.
        """
        #get information about current state
        S_t_1 = state.state["Spot"] #current spot, corresponds to "t-1" if "t" is the time at the end of the step
        t_1 = state.state["date"]
        data = state.state["data"]
        pos = state.state["position"] #is +1  -1 or 0

        #Fetch next spot value and compute the return
        # NOTE(review): u.get_next presumably returns the date following t_1 in
        # `data` and whether it is the final one — confirm in utils_project_mehdi
        t, is_last = u.get_next(t_1, data)
        S_t = data.loc[t].iloc[0]  # first column, by position
        # Simple return relative to the price at the START of the step, so that
        # compounding (1 + r) reproduces the wealth of the position
        r =  pos*(S_t - S_t_1)/S_t_1

        #build next state
        next_state = {
            "Spot" :  S_t,
            "position" : np.sign(pos+action),
            "date" : t,
            "data" : data
        }
        if is_last:
            next_state = Terminal(next_state)
        else:
            next_state = NonTerminal(next_state)
        return Constant((next_state,r))


    def generate_start_state(self,which = "train"):
        """
        Generates the distribution of the initial state.

        - which == "train": uniform choice among the first states of the training paths
        - which == "test": the first state of this instance's test path

        Raises:
        - ValueError for any other value of `which`.
        """
        if which == "train":
            return Choose( [generate_initial_state_from_data(train_) for train_ in self.train] )
        if which == "test":
            # use the instance's own test set, not a module-level variable
            return Constant(generate_initial_state_from_data(self.test))
        raise ValueError(f"which must be 'train' or 'test', got {which!r}")

In [67]:
class Backtester():
    """
    This class is to visualize the backtest of a given trading policy
    """

    def __init__(self, trading, policy):
        self.trading = trading #MDP
        self.policy = policy #policy for the MDP

    def get_returns(self):
        """
        Runs the policy on the MDP's test start state and stores the rewards.

        The result is saved in `self.returns`: a DataFrame with one column
        'Reward', indexed by 'Date'. Each reward is stamped with the date of the
        state the step started from.
        """
        start_states = self.trading.generate_start_state("test") # we take the test set data of the trading policy
        sequence = self.trading.simulate_actions(start_states, self.policy)

        # one [reward, date] record per simulated step
        bt = [[x.reward, x.state.state["date"]] for x in sequence]

        df = pd.DataFrame(bt, columns=['Reward', 'Date'])
        df['Date'] = pd.to_datetime(df['Date'])
        self.returns = df.set_index('Date')


    def summary(self):
        """
        main summary of the backtest

        Recomputes the returns, then plots the compounded rewards with the
        annualised Sharpe ratio (sqrt(252) * mean / std) in the title.
        """
        self.get_returns()
        # Select the column by label; positional [0] on a label-indexed Series is deprecated
        rewards = self.returns['Reward']
        sharpe = np.sqrt(252)*rewards.mean()/rewards.std()
        u.plot_plotly((1+self.returns).cumprod(),title=f"Sharpe Ratio {round(sharpe,2)}")

In [68]:
# Backtest a hand-picked threshold rule on the test path
bt = Backtester(
    Trading(train, test),
    ThresholdTradingPolicy(
        enter_long=99.95,
        exit_long=99.99,
        enter_short=100.05,
        exit_short=100.025,
    ),
)
bt.summary()

In [84]:
# NOTE(review): this assignment does not work — the recorded output of this cell is
# "FrozenInstanceError: cannot assign to field 'learning_rate'", i.e. the gradient
# settings object is frozen and cannot be modified after creation.
# Presumably the learning rate has to be chosen when the approximation is built
# (e.g. by passing an AdamGradient to LinearFunctionApprox.create) — TODO confirm
# against rl.function_approx.
# This cell (In [84]) also sits above the cell that defines approx_0 (In [74]), so it
# cannot run top-to-bottom on a fresh kernel.
approx_0.weights.adam_gradient.learning_rate = 0.1

FrozenInstanceError: cannot assign to field 'learning_rate'

In [74]:
# Trading MDP built from the simulated training paths and the held-out test path
mdp_trading = Trading(train,test)

# Distribution of episode start states, taken from the training paths
states = mdp_trading.generate_start_state("train")

#phi(s,a)
def generate_features(pos,act):
    """
    Builds the two feature functions phi(s,a) attached to one (position, action) pair.

    Each feature takes x = (state, action). Both are 0 unless the state's
    "position" equals `pos` and the action equals `act`; in that case the first
    returns 1 (intercept) and the second returns the state's "Spot" (slope).
    For now the representation is linear in the spot.
    """
    def is_selected(x):
        # true only for the (position, action) cell these features belong to
        return x[0].state["position"] == pos and x[1] == act

    intercept = lambda x: 1 if is_selected(x) else 0
    spot_slope = lambda x: x[0].state["Spot"] if is_selected(x) else 0

    return [intercept, spot_slope]

# Stack the feature pairs of all nine (position, action) combinations,
# position varying slowest
ffs = []
for pos in (-1, 0, 1):
    for a in (-1, 0, 1):
        ffs.extend(generate_features(pos, a))

# Linear Q-value approximation on those features, all other settings left at their defaults
approx_0 = LinearFunctionApprox.create(feature_functions=ffs)


# Exploration schedule epsilon = 1/k, where k is the value glie_mc_control passes in
# (presumably the episode count — confirm in rl.monte_carlo)
def epsilon_as_func_of_episodes(k):
    return 1/k


gamma = 0.9

# Iterator of successive Q-value approximations; nothing is consumed yet
qvfs = glie_mc_control(mdp_trading, states, approx_0, gamma, epsilon_as_func_of_episodes)

In [75]:
import itertools
import rl.iterate as iterate
import pickle

In [79]:
num_episodes = 100000

final_qvf = None

# Pull at most num_episodes + 1 approximations from the iterator, always keeping the
# latest one; the index is printed every tenth item as a progress trace.
for i, qvf in enumerate(itertools.islice(qvfs, num_episodes + 1)):
    final_qvf = qvf
    if i < num_episodes and i % 10 == 9:
        print(i)

9
19
29
39
49
59
69
79
89
99
109
119
129
139
149
159
169
179
189
199
209
219
229
239
249
259
269
279
289
299
309
319
329
339
349
359
369
379
389
399
409
419
429
439
449
459
469
479
489
499
509
519
529
539
549
559
569
579
589
599
609
619
629
639
649
659
669
679
689
699
709
719
729
739
749
759
769
779
789
799
809
819
829
839
849
859
869
879
889
899
909
919
929
939
949
959
969
979
989
999
1009
1019
1029
1039
1049
1059
1069
1079
1089
1099
1109
1119
1129
1139
1149
1159
1169
1179
1189
1199
1209
1219
1229
1239
1249
1259
1269
1279
1289
1299
1309
1319
1329
1339
1349
1359
1369
1379
1389
1399


KeyboardInterrupt: 

## Visualizing the Q function

In [77]:
# Display the last Q-value approximation kept by the training loop
final_qvf

LinearFunctionApprox(feature_functions=[<function generate_features.<locals>.<lambda> at 0x12f163010>, <function generate_features.<locals>.<lambda> at 0x1386edd80>, <function generate_features.<locals>.<lambda> at 0x12fbd3640>, <function generate_features.<locals>.<lambda> at 0x138578700>, <function generate_features.<locals>.<lambda> at 0x138578670>, <function generate_features.<locals>.<lambda> at 0x138578b80>, <function generate_features.<locals>.<lambda> at 0x138578ee0>, <function generate_features.<locals>.<lambda> at 0x138578ca0>, <function generate_features.<locals>.<lambda> at 0x138579a20>, <function generate_features.<locals>.<lambda> at 0x138578f70>, <function generate_features.<locals>.<lambda> at 0x13857b5b0>, <function generate_features.<locals>.<lambda> at 0x1385796c0>, <function generate_features.<locals>.<lambda> at 0x138578280>, <function generate_features.<locals>.<lambda> at 0x13857b520>, <function generate_features.<locals>.<lambda> at 0x13857b640>, <function gener

In [78]:
import plotly.graph_objs as go

fig = go.Figure()

# Spot grid shared by every curve: from 98 up to (but excluding) 102 in steps of 0.01.
# It does not depend on (position, action), so it is built once.
spotPrices = np.arange(100-2,100+2,0.01)

# One curve per (position, action) pair: Q value as a function of the spot
for (p,a) in [(-1,-1),(-1,0),(-1,1), (0,-1),(0,0),(0,1), (1,-1),(1,0),(1,1)]:

    qvals = []

    for spot in spotPrices:
        # "date" and "data" only complete the state dictionary; the feature functions
        # defined in this notebook read "Spot" and "position" only
        state = NonTerminal({
            "Spot" : spot,
            "position" : p,
            "date" : pd.to_datetime("2023-01-01"),
            "data" : test
        })

        qvals.append(final_qvf((state,a)))

    fig.add_trace(go.Scatter(x=spotPrices, y=qvals, mode='lines', name=f'pos : {p}, act : {a}'))



fig.update_layout(title='Q values for different (spot,action,position) states',
                  xaxis_title='Spot',
                  yaxis_title='Q value')

# Show the figure
fig.show()

## 2. Challenging the agent : introducing non stationarities

The goal of this part is to see whether the RL agent can learn to adapt quickly to non-stationarities in the data.

Add features to the state space: fitted momentum over a lookback window.

## 3. Train the agent on real data

In [8]:
# Display the simulated test path
test

Unnamed: 0,Value
2019-01-01,100.000000
2019-01-02,99.995357
2019-01-03,100.004432
2019-01-04,100.001750
2019-01-07,100.008389
...,...
2023-12-25,99.994112
2023-12-26,100.001776
2023-12-27,100.007443
2023-12-28,100.014757


In [None]:
# prepare real data into [[episodes], test] where test is the start of the trading sample. 

