# Импорт библиотек и данных

In [3]:
import pandas as pd
from indicators import RSI, extract_bb, MFI, CMF, ADX, ATR
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import warnings
import torch.nn as nn

from IPython.display import clear_output
import plotly.graph_objects as go
import plotly.express as px
from sb3_contrib import RecurrentPPO
import gym
from stable_baselines3.common.env_util import make_vec_env
from gym import spaces
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("sber_5min.csv")
df = df.dropna().reset_index(drop=True)
df = df.drop(columns=["<TICKER>", "<PER>", "<DATE>", "<TIME>"])
df.columns = ["open", "high", "low", "close", "volume"]
df

Unnamed: 0,open,high,low,close,volume
0,276.07,276.20,275.93,276.12,309450
1,276.11,276.79,276.10,276.55,360970
2,276.55,276.75,276.46,276.49,224500
3,276.48,276.91,276.48,276.89,271960
4,276.89,277.12,276.76,277.07,587810
...,...,...,...,...,...
11699,319.71,319.75,319.62,319.62,49780
11700,319.62,319.71,319.62,319.67,11100
11701,319.67,319.68,319.63,319.63,26130
11702,319.63,319.71,319.63,319.67,36970


# Создание индикаторов

In [5]:
n_steps = 14

prices = df["close"]

rsi_values = RSI(prices=prices, n_steps=n_steps)
bb_values = extract_bb(prices=prices, n_steps=n_steps)
mfi_values = MFI(
    open=df["open"],
    high=df["high"],
    low=df["low"],
    close=df["close"],
    volume=df["volume"],
    n_steps=n_steps,
)


cmf_values = CMF(
    open=df["open"],
    high=df["high"],
    low=df["low"],
    close=df["close"],
    volume=df["volume"],
    n_steps=n_steps,
)

ADX_values = ADX(
    high=df["high"],
    low=df["low"],
    close=df["close"],
    n_steps=3 * n_steps,
)

ATR_values = ATR(
    high=df["high"],
    low=df["low"],
    close=df["close"],
    n_steps=n_steps,
)

assert len(rsi_values) == len(
    bb_values
), f"Indicators length don't coincide: {len(rsi_values)} and {len(bb_values)}"

In [6]:
indicator_data = (
    pd.DataFrame(
        data=np.array(
            [rsi_values, bb_values, mfi_values, cmf_values, ADX_values, ATR_values]
        ).T,
        columns=["rsi", "bb", "mfi", "cmf", "adx", "atr"],
    )
    .dropna()
    .reset_index(drop=True)
    .astype(np.float64)
)
indicator_data

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr
0,55.000000,0.136339,29.354004,0.212485,-16.340538,0.058710
1,50.000000,-1.362839,33.692567,0.351489,-20.262842,0.054379
2,48.837209,-0.973785,47.609903,0.392437,-28.840637,0.049325
3,26.829268,-2.291727,47.988862,0.400835,-35.719955,0.052216
4,23.157895,-2.476260,39.416791,0.265056,-44.018773,0.054868
...,...,...,...,...,...,...
11657,44.776119,-0.481382,31.875794,-0.491555,-2.070322,0.027532
11658,63.636364,0.453789,34.518385,-0.460541,-8.415577,0.027532
11659,52.000000,-0.232670,31.326292,-0.502769,-13.504626,0.027532
11660,52.941176,0.457834,46.579252,-0.313026,-8.255101,0.025445


In [7]:
indicator_data.describe()

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr
count,11662.0,11662.0,11662.0,11662.0,11662.0,11662.0
mean,51.409455,0.055223,47.164017,0.068884,0.008006,0.066392
std,16.829651,1.237916,17.597642,0.238696,35.06559,0.041753
min,0.0,-3.510732,0.732035,-0.669093,-97.19934,0.007661
25%,39.728295,-0.906419,33.864992,-0.09949,-25.403567,0.038154
50%,50.819672,0.086532,46.762273,0.062559,-0.706845,0.056151
75%,62.962963,0.9949,59.749748,0.235795,25.384283,0.081615
max,99.418605,3.599744,98.749945,0.902805,94.782791,0.362483


# Обработка фичей

In [8]:
scaler = MinMaxScaler()
scaler.fit(indicator_data.iloc[:, :-1])
indicator_data.iloc[:, :-1] = scaler.transform(indicator_data.iloc[:, :-1])
indicator_data

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr
0,0.553216,0.512915,0.292008,0.560837,0.421179,0.058710
1,0.502924,0.302074,0.336271,0.649268,0.400748,0.054379
2,0.491228,0.356790,0.478258,0.675317,0.356068,0.049325
3,0.269862,0.171438,0.482124,0.680660,0.320235,0.052216
4,0.232933,0.145486,0.394670,0.594281,0.277008,0.054868
...,...,...,...,...,...,...
11657,0.450380,0.426040,0.317735,0.112945,0.495510,0.027532
11658,0.640085,0.557561,0.344696,0.132675,0.462458,0.027532
11659,0.523041,0.461019,0.312129,0.105811,0.435951,0.027532
11660,0.532508,0.558129,0.467743,0.226520,0.463294,0.025445


In [9]:
indicator_data[["rsi", "bb"]].iloc[1].values

array([0.50292398, 0.30207449])

In [10]:
indicator_data

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr
0,0.553216,0.512915,0.292008,0.560837,0.421179,0.058710
1,0.502924,0.302074,0.336271,0.649268,0.400748,0.054379
2,0.491228,0.356790,0.478258,0.675317,0.356068,0.049325
3,0.269862,0.171438,0.482124,0.680660,0.320235,0.052216
4,0.232933,0.145486,0.394670,0.594281,0.277008,0.054868
...,...,...,...,...,...,...
11657,0.450380,0.426040,0.317735,0.112945,0.495510,0.027532
11658,0.640085,0.557561,0.344696,0.132675,0.462458,0.027532
11659,0.523041,0.461019,0.312129,0.105811,0.435951,0.027532
11660,0.532508,0.558129,0.467743,0.226520,0.463294,0.025445


# Пересборка и разделение данных

In [11]:
all_data = indicator_data.copy()
all_data[df.columns] = df.iloc[len(df) - len(indicator_data) :, :].values

all_data

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr,open,high,low,close,volume
0,0.553216,0.512915,0.292008,0.560837,0.421179,0.058710,277.00,277.08,276.92,277.08,106090.0
1,0.502924,0.302074,0.336271,0.649268,0.400748,0.054379,277.06,277.06,276.95,276.95,46240.0
2,0.491228,0.356790,0.478258,0.675317,0.356068,0.049325,276.95,276.98,276.78,276.98,90070.0
3,0.269862,0.171438,0.482124,0.680660,0.320235,0.052216,276.97,276.97,276.79,276.82,75920.0
4,0.232933,0.145486,0.394670,0.594281,0.277008,0.054868,276.79,276.83,276.65,276.69,121620.0
...,...,...,...,...,...,...,...,...,...,...,...
11657,0.450380,0.426040,0.317735,0.112945,0.495510,0.027532,319.71,319.75,319.62,319.62,49780.0
11658,0.640085,0.557561,0.344696,0.132675,0.462458,0.027532,319.62,319.71,319.62,319.67,11100.0
11659,0.523041,0.461019,0.312129,0.105811,0.435951,0.027532,319.67,319.68,319.63,319.63,26130.0
11660,0.532508,0.558129,0.467743,0.226520,0.463294,0.025445,319.63,319.71,319.63,319.67,36970.0


In [12]:

N = len(all_data)
split_size = 0.3

TRAIN_DATA, TEST_DATA = (
    all_data.iloc[: -int(split_size * N)],
    all_data.iloc[-int(split_size * N) :],
)

In [44]:
TRAIN_DATA

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr,open,high,low,close,volume
0,0.553216,0.512915,0.292008,0.560837,0.421179,0.058710,277.00,277.08,276.92,277.08,106090.0
1,0.502924,0.302074,0.336271,0.649268,0.400748,0.054379,277.06,277.06,276.95,276.95,46240.0
2,0.491228,0.356790,0.478258,0.675317,0.356068,0.049325,276.95,276.98,276.78,276.98,90070.0
3,0.269862,0.171438,0.482124,0.680660,0.320235,0.052216,276.97,276.97,276.79,276.82,75920.0
4,0.232933,0.145486,0.394670,0.594281,0.277008,0.054868,276.79,276.83,276.65,276.69,121620.0
...,...,...,...,...,...,...,...,...,...,...,...
8159,0.484297,0.383490,0.593946,0.556736,0.415342,0.059891,307.06,307.16,307.01,307.12,67410.0
8160,0.384589,0.311237,0.497352,0.461012,0.414610,0.059023,307.12,307.19,307.00,307.05,118550.0
8161,0.254225,0.286258,0.377872,0.358139,0.413808,0.057722,307.07,307.11,307.00,307.00,137930.0
8162,0.262395,0.331151,0.365733,0.329553,0.386614,0.056857,307.00,307.09,306.94,307.02,128630.0


# Создание трейдинговой среды

In [13]:
steps_obs = 1
commision_bps = 0
rewards = []
Statistics = []

init_balance = 200
init_active = 10

start_price = all_data["close"].iloc[0]
init_pv = init_balance + init_active * start_price
rnn_steps = 2

MAX_BALANCE = 2 * init_pv
MAX_ACTIVE = 2 * init_pv / start_price


class CustomEnv(gym.Env):
    def __init__(self, df, init_balance, init_active, test=False):
        super(CustomEnv, self).__init__()
        self.df = df
        self.test = test
        self.seed = lambda x: 0
        self.current_step = 0

        self.observation_space = spaces.Box(
            low=0,
            high=1,
            shape=(rnn_steps, len(indicator_data.columns) + 2),
            dtype=np.float32,
        )
        self.balance = init_balance
        self.active = init_active
        self.init_balance = init_balance
        self.init_active = init_active
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.balance + self.active * self.current_price

        self.action_space = spaces.Box(
            low=np.array([-1]),
            high=np.array([1]),
            dtype=np.float32,
        )

    def reset(self):

        print("reseting...")
        self.current_step = rnn_steps - 1
        if self.test:
            rewards.append([])
            Statistics.append([])

        init_state = np.append(
            self.df[indicator_data.columns]
            .iloc[self.current_step - rnn_steps + 1 : self.current_step + 1]
            .values,
            np.array(
                [
                    [self.balance / MAX_BALANCE, self.active / MAX_ACTIVE]
                    for i in range(rnn_steps)
                ]
            ),
            axis=1,
        )
        self.balance = init_balance
        self.active = init_active
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.balance + self.init_active * self.current_price
        return init_state

    def reward_function(self, action):
        action = float(action)
        next_price = self.df["close"].iloc[self.current_step + 1]
        # покупаем
        if action > 0:
            money_to_spend = self.balance * action
            self.active += money_to_spend / self.current_price
            self.balance -= money_to_spend
        # продаем
        elif action < 0:
            active_to_sell = self.active * abs(action)
            self.active -= active_to_sell
            self.balance += active_to_sell * self.current_price
        new_pv = self.active * next_price + self.balance
        reward = (new_pv - self.pv) / self.pv * 100 

        return reward

    def step(self, action):

        reward = self.reward_function(action)
        self.current_step += 1
        if self.current_step < len(self.df.index) - (steps_obs + 1):
            next_state = np.append(
                self.df[indicator_data.columns]
                .iloc[self.current_step - rnn_steps + 1 : self.current_step + 1]
                .values,
                np.array(
                    [
                        [self.balance / MAX_BALANCE, self.active / MAX_ACTIVE]
                        for i in range(rnn_steps)
                    ]
                ),
                axis=1,
            )
            done = False
        else:
            next_state = np.zeros(self.observation_space.shape)
            done = True

    
        info = {}
        if self.test:
            rewards[-1].append(reward)
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.active * self.current_price + self.balance

        return next_state, reward, done, info



train_env = CustomEnv(df=TRAIN_DATA, init_balance=init_balance, init_active=init_active)
train_env = DummyVecEnv([lambda: train_env])


test_env = CustomEnv(df=TEST_DATA, init_balance=init_balance, init_active=init_active)
test_env = DummyVecEnv([lambda: test_env])

# Перебор параметров модели

In [14]:
def main_func(params):
    print(params)
    policy_kwargs = dict(
        activation_fn=nn.ReLU,
        net_arch=dict(
            pi=[params["n_neurons"] for i in range(params["n_layers"])],
            vf=[params["n_neurons"] for i in range(params["n_layers"])],
        ),
        lstm_hidden_size=params["lstm_hidden_size"],
        n_lstm_layers=params["n_lstm_layers"],
    )
    batch_size = 256

    model = RecurrentPPO(
        "MlpLstmPolicy",
        train_env,
        verbose=1,
        device="cpu",
        policy_kwargs=policy_kwargs,
        learning_rate=0.0001,
        batch_size=batch_size,  
        n_epochs=5, 
        gamma=0.99, 
        n_steps=batch_size * 4,  
    )

    model.learn(total_timesteps=2000) 


    mean_reward, std_reward = evaluate_policy(model, train_env, n_eval_episodes=1)
    return mean_reward


In [15]:
import itertools

param_grid = {
    "n_layers": [2, 3, 10, 20],
    "n_neurons": [20, 30, 50, 300],
    "lstm_hidden_size": [8, 16, 32],
    "n_lstm_layers": [2, 5, 10],
}

keys = param_grid.keys()
values = param_grid.values()
combinations = list(itertools.product(*values))

results = []

for combination in combinations:
    param_dict = dict(zip(keys, combination))
    reward = main_func(param_dict)
    results.append(param_dict | dict(res=reward))

{'n_layers': 3, 'n_neurons': 10, 'lstm_hidden_size': 32, 'n_lstm_layers': 10}
Using cpu device
reseting...
-----------------------------
| time/              |      |
|    fps             | 392  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 1024 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 184          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 5.403417e-07 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.00062      |
|    learning_rate        | 0.0001       |
|    loss                 | 0.0106       |
|    n_updates            | 5            |
|    policy_gradient_loss | -1.18e-05 

In [16]:

df_results = pd.DataFrame(results)
sorted_df = df_results.sort_values(by='res', ascending=False)

print(sorted_df)

    n_layers  n_neurons  lstm_hidden_size  n_lstm_layers        res
35         7        300               128             20  10.504894
20         5        300                32             10  10.504885
22         5        300               128             10  10.504869
23         5        300               128             20  10.504848
32         7        300                32             10  10.504516
9          3        300                32             20  10.504357
33         7        300                32             20  10.503478
34         7        300               128             10  10.500879
25         7         10                32             20  10.499059
16         5         20                32             10  10.494222
17         5         20                32             20  10.493040
24         7         10                32             10  10.483843
31         7         20               128             20  10.483367
1          3         10                32       

# Финальное обучение модели, тесты

In [17]:
policy_kwargs = dict(
    activation_fn=nn.ReLU,
    net_arch=dict(pi=[300, 300, 300], vf=[300, 300, 300]),
    lstm_hidden_size=32,
    n_lstm_layers=10,
)
batch_size = 1024
model = RecurrentPPO(
    "MlpLstmPolicy",
    train_env,
    verbose=1,
    device="cpu",
    policy_kwargs=policy_kwargs,
    learning_rate=0.0001,
    batch_size=batch_size, 
    n_epochs=5, 
    gamma=0.99,  
    n_steps=batch_size * 4, 
)

model.learn(total_timesteps=1500000) 

Using cpu device
reseting...
-----------------------------
| time/              |      |
|    fps             | 372  |
|    iterations      | 1    |
|    time_elapsed    | 11   |
|    total_timesteps | 4096 |
-----------------------------
reseting...
-----------------------------------------
| time/                   |             |
|    fps                  | 174         |
|    iterations           | 2           |
|    time_elapsed         | 46          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 6.17918e-07 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | -4.65e-05   |
|    learning_rate        | 0.0001      |
|    loss                 | 0.00983     |
|    n_updates            | 5           |
|    policy_gradient_loss | -2.97e-06   |
|    std                  | 1           |
|    value_loss           | 0.0158 

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x14f8226d390>

In [18]:
mean_reward, std_reward = evaluate_policy(model, train_env, n_eval_episodes=3)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

reseting...
reseting...
reseting...
reseting...
Mean reward: 10.49983421410434 +/- 0.0


In [19]:
mean_reward, std_reward = evaluate_policy(model, test_env, n_eval_episodes=3)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

reseting...
reseting...
reseting...
reseting...
Mean reward: 4.0667058161925524 +/- 0.0


In [23]:
obs = train_env.reset()

num_iterations = 20
actions_taken = []
rewards = []

for _ in range(num_iterations):
    print(obs)
    action, _ = model.predict(obs, deterministic=False)

    obs, reward, done, _ = train_env.step(action)

    actions_taken.append(action)
    rewards.append(reward)

    if done:
        break

reseting...
[[[0.5532164  0.51291513 0.29200754 0.5608367  0.4211788  0.05870958
   0.         0.5000427 ]
  [0.50292397 0.3020745  0.3362705  0.6492676  0.40074822 0.05437868
   0.         0.5000427 ]]]
[[[0.50292397 0.3020745  0.3362705  0.6492676  0.40074822 0.05437868
   0.         0.5000158 ]
  [0.49122807 0.35679    0.4782582  0.6753171  0.35606804 0.04932488
   0.         0.5000158 ]]]
[[[0.49122807 0.35679    0.4782582  0.6753171  0.35606804 0.04932488
   0.         0.5000158 ]
  [0.26986164 0.17143789 0.48212442 0.68065983 0.32023492 0.05221552
   0.         0.5000158 ]]]
[[[0.26986164 0.17143789 0.48212442 0.68065983 0.32023492 0.05221552
   0.         0.5000158 ]
  [0.23293321 0.14548567 0.39467028 0.5942807  0.2770079  0.05486779
   0.         0.5000158 ]]]
[[[0.23293321 0.14548567 0.39467028 0.5942807  0.2770079  0.05486779
   0.         0.5000158 ]
  [0.31612363 0.29007798 0.4512735  0.6371567  0.23803216 0.05583813
   0.         0.5000158 ]]]
[[[0.31612363 0.29007798 0.4

In [24]:
test_df = all_data.iloc[rnn_steps - 1 : num_iterations + (rnn_steps - 1), :].copy()
test_df["actions"] = np.array(actions_taken).T[0][0]
test_df["rewards"] = np.array(rewards)

test_df

Unnamed: 0,rsi,bb,mfi,cmf,adx,atr,open,high,low,close,volume,actions,rewards
1,0.502924,0.302074,0.336271,0.649268,0.400748,0.054379,277.06,277.06,276.95,276.95,46240.0,1.0,0.010832
2,0.491228,0.35679,0.478258,0.675317,0.356068,0.049325,276.95,276.98,276.78,276.98,90070.0,1.0,-0.057766
3,0.269862,0.171438,0.482124,0.68066,0.320235,0.052216,276.97,276.97,276.79,276.82,75920.0,1.0,-0.046962
4,0.232933,0.145486,0.39467,0.594281,0.277008,0.054868,276.79,276.83,276.65,276.69,121620.0,1.0,0.039756
5,0.316124,0.290078,0.451274,0.637157,0.238032,0.055838,276.68,276.8,276.56,276.8,132040.0,-0.112023,0.009624
6,0.40686,0.339764,0.507127,0.655475,0.246646,0.052714,276.8,276.83,276.76,276.83,60110.0,0.079945,-0.0324
7,0.324467,0.272034,0.452004,0.58007,0.238837,0.051517,276.82,276.83,276.68,276.73,170730.0,1.0,0.036136
8,0.384589,0.38558,0.505455,0.641733,0.224973,0.052002,276.73,276.83,276.6,276.83,68580.0,1.0,-0.032511
9,0.353406,0.32355,0.459144,0.61066,0.213891,0.050562,276.82,276.83,276.71,276.74,44300.0,-0.188747,0.002931
10,0.335283,0.353102,0.513093,0.585471,0.196384,0.054417,276.73,276.83,276.62,276.75,63930.0,1.0,0.018067


In [25]:
fig = px.line(
    test_df, x=test_df.index, y="close", hover_data=["rsi", "bb", "mfi", "cmf", "adx", "atr", "actions", "rewards"]
)
fig.update_traces(mode="markers+lines")

fig.show()