In [4]:
import math
import random
from collections import deque, namedtuple

import gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from binance import Client
from gym import spaces

# =============================================================================
# 1. Получение данных (свечи) с Binance
# =============================================================================
client = Client()  # публичный клиент


def get_klines(
    symbol="ETHUSDT",
    interval="1m",
    start_date="6 day ago UTC",
    end_date="4 day ago UTC",
):
    try:
        klines = client.get_historical_klines(
            symbol=symbol, interval=interval, start_str=start_date, end_str=end_date
        )
        columns = [
            "time",
            "open",
            "high",
            "low",
            "close",
            "volume",
            "close_time",
            "quote_volume",
            "trades",
            "taker_buy_base",
            "taker_buy_quote",
            "ignore",
        ]
        df = pd.DataFrame(klines, columns=columns, dtype=float)
        df["time"] = pd.to_datetime(df["time"], unit="ms")
        return df[["time", "open", "high", "low", "close", "volume", "trades"]]
    except Exception as e:
        print(f"Ошибка при получении данных: {e}")
        return None


df = get_klines(symbol="ETHUSDT", interval="1m")
print(df.head())

if df is not None:
    N = len(df)
    ratio = 0.8
    df_test = df[int(N * ratio) :].reset_index(drop=True)
    df = df[: int(N * ratio)].reset_index(drop=True)
else:
    df = pd.DataFrame(
        columns=["time", "open", "high", "low", "close", "volume", "trades"]
    )
    df_test = pd.DataFrame(
        columns=["time", "open", "high", "low", "close", "volume", "trades"]
    )


                 time     open     high      low    close    volume  trades
0 2025-03-18 20:03:00  1905.24  1905.49  1904.36  1904.65   90.7889   655.0
1 2025-03-18 20:04:00  1904.64  1906.69  1904.64  1906.69  234.7526   926.0
2 2025-03-18 20:05:00  1906.69  1907.56  1906.57  1906.64  147.4473   949.0
3 2025-03-18 20:06:00  1906.65  1906.87  1905.36  1906.56   79.5244  1108.0
4 2025-03-18 20:07:00  1906.56  1907.39  1905.85  1907.01  109.2926  1188.0


In [3]:
gamma_range = [0.005, 1.0]
kappa_range = [1e-4, 1e-2]
vol_corr_range = [-0.01, 0.01]


# =============================================================================
# 3. Определение среды MarketMakingEnv
# =============================================================================
class MarketMakingEnv(gym.Env):
    """
    Окружение маркет-мейкинга по Avellaneda-Stoikov с использованием свечных данных.
    Состояние: [inventory, price history (относительные изменения цены, длина=history_len), PnL].
    Действие:
      - Если continuous=False: мультидискретный выбор (γ, κ, Δσ) из заданных списков.
      - Если continuous=True: непрерывное действие – вектор [γ, κ, Δσ] из заданных диапазонов.
    """

    def __init__(
        self,
        df,
        gamma_values=None,
        kappa_values=None,
        vol_corr_values=None,
        T=60.0,
        history_len=16,
        continuous=False,
    ):
        super(MarketMakingEnv, self).__init__()
        self.prices = df.close.values
        self.prices_high = df.high.values
        self.prices_low = df.low.values
        self.N = len(df)
        self.T = T  # торговый горизонт (сек)
        self.history_len = history_len
        self.continuous = continuous
        self.initial_cash = 10000.0
        self.fee_rate = 0.0
        if continuous:
            self.action_space = spaces.Box(
                low=np.array(
                    [gamma_range[0], kappa_range[0], vol_corr_range[0]],
                    dtype=np.float32,
                ),
                high=np.array(
                    [gamma_range[1], kappa_range[1], vol_corr_range[1]],
                    dtype=np.float32,
                ),
                dtype=np.float32,
            )
        else:
            self.gamma_values = gamma_values
            self.kappa_values = kappa_values
            self.vol_corr_values = vol_corr_values
            self.action_space = spaces.MultiDiscrete(
                [len(gamma_values), len(kappa_values), len(vol_corr_values)]
            )
        # Размер наблюдения: [inventory (1), price history (history_len), PnL (1)] = 1 + history_len + 1
        low = np.concatenate(
            (np.array([-100.0]), np.full((history_len,), -1.0), np.array([-1e9]))
        )
        high = np.concatenate(
            (np.array([100.0]), np.full((history_len,), 1.0), np.array([1e9]))
        )
        self.observation_space = spaces.Box(low, high, dtype=np.float32)

    def reset(self):
        self.t = 0
        self.inventory = 0.0
        self.cash = self.initial_cash
        self.price_change_history = [0.0] * self.history_len
        pnl = self.cash - self.initial_cash
        state = np.concatenate(
            ([self.inventory], np.array(self.price_change_history), [pnl])
        )
        return state.astype(np.float32)

    def step(self, action):
        # Определяем параметры из действия
        if self.continuous:
            gamma = float(action[0])
            kappa = float(action[1])
            vol_corr = float(action[2])
        else:
            n_gamma = len(self.gamma_values)
            n_kappa = len(self.kappa_values)
            n_vol = len(self.vol_corr_values)
            if isinstance(action, (np.ndarray, list, tuple)):
                gamma_idx = int(action[0])
                kappa_idx = int(action[1])
                vol_corr_idx = int(action[2])
            else:
                action = int(action)
                gamma_idx = action // (n_kappa * n_vol)
                rem = action % (n_kappa * n_vol)
                kappa_idx = rem // n_vol
                vol_corr_idx = rem % n_vol
            gamma = self.gamma_values[gamma_idx]
            kappa = self.kappa_values[kappa_idx]
            vol_corr = self.vol_corr_values[vol_corr_idx]
        mid_price = self.prices[self.t]
        sigma_est = 0.0
        if self.t > 1:
            start = max(0, self.t - int(self.T))
            window_prices = self.prices[start : self.t + 1]
            if len(window_prices) > 1:
                returns = np.diff(window_prices) / window_prices[:-1]
                sigma_est = np.std(returns)
        effective_sigma = sigma_est + vol_corr
        if effective_sigma < 1e-8:
            effective_sigma = 1e-8
        reservation_price = (
            mid_price - self.inventory * gamma * (effective_sigma**2) * self.T
        )
        delta = gamma * (effective_sigma**2) * self.T + (1.0 / gamma) * math.log(
            1 + gamma / kappa
        )
        bid_price = reservation_price - delta / 2
        ask_price = reservation_price + delta / 2
        done = False
        reward = 0.0
        if self.t < self.N - 1:
            next_price = self.prices[self.t + 1]
            if self.prices_high[self.t + 1] >= ask_price:
                self.inventory -= 1.0
                self.cash += ask_price * (1.0 - self.fee_rate)
            if self.prices_low[self.t + 1] <= bid_price:
                self.inventory += 1.0
                self.cash -= bid_price * (1.0 + self.fee_rate)
            current_value = self.cash + self.inventory * next_price
            prev_value = self.cash + self.inventory * mid_price
            pnl_change = current_value - prev_value
            reward = pnl_change - 0.001 * abs(self.inventory)
            self.t += 1
        else:
            done = True
            current_value = self.cash + self.inventory * mid_price
            reward = current_value - self.initial_cash
        if self.t > 0:
            rel_change = (self.prices[self.t] - self.prices[self.t - 1]) / self.prices[
                self.t - 1
            ]
        else:
            rel_change = 0.0
        self.price_change_history.pop(0)
        self.price_change_history.append(rel_change)
        if not done:
            new_mid = self.prices[self.t]
            pnl = self.cash + self.inventory * new_mid - self.initial_cash
            obs = np.concatenate(
                ([self.inventory], np.array(self.price_change_history), [pnl])
            )
            obs = obs.astype(np.float32)
        else:
            obs = None
        return obs, reward, done, {}


# =============================================================================
# 4. Нейросетевая архитектура: CombinedNetwork
# =============================================================================
class CombinedNetwork(nn.Module):
    def __init__(
        self,
        history_len=16,
        inv_pnl_dim=2,
        conv_out_channels=32,
        fc_inv_out=32,
        fc1_out=64,
        fc2_out=64,
    ):
        super(CombinedNetwork, self).__init__()
        # Сверточная ветвь для price history (вход: [batch, 1, history_len])
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3)
        self.conv2 = nn.Conv1d(
            in_channels=16, out_channels=conv_out_channels, kernel_size=3
        )
        # Выходной размер сверточной части: conv_out_channels * (history_len - 4)
        conv_feat_dim = conv_out_channels * (history_len - 4)
        # Полносвязная ветвь для inventory и PnL (2 значения)
        self.fc_inv = nn.Linear(inv_pnl_dim, fc_inv_out)
        # Основная часть объединяет выходы сверточной и MLP ветвей
        combined_dim = conv_feat_dim + fc_inv_out
        self.fc1 = nn.Linear(combined_dim, fc1_out)
        self.fc2 = nn.Linear(fc1_out, fc2_out)

    def forward(self, state):
        # state: [batch, 18] = [inventory (1), price_history (16), pnl (1)]
        inv = state[:, 0:1]  # [batch, 1]
        price_history = state[:, 1:17]  # [batch, 16]
        pnl = state[:, 17:18]  # [batch, 1]
        inv_pnl = torch.cat([inv, pnl], dim=1)  # [batch, 2]
        x_price = price_history.unsqueeze(1)  # [batch, 1, 16]
        x_price = torch.relu(self.conv1(x_price))  # -> [batch, 16, 14]
        x_price = torch.relu(self.conv2(x_price))  # -> [batch, conv_out_channels, 12]
        x_price = x_price.view(x_price.size(0), -1)  # [batch, conv_out_channels*12]
        x_inv = torch.relu(self.fc_inv(inv_pnl))  # [batch, fc_inv_out]
        x = torch.cat([x_price, x_inv], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x  # [batch, fc2_out]


# =============================================================================
# 5. CombinedQNetwork для DQN
# =============================================================================
class CombinedQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, history_len=16):
        super(CombinedQNetwork, self).__init__()
        self.base = CombinedNetwork(
            history_len=history_len,
            inv_pnl_dim=2,
            conv_out_channels=32,
            fc_inv_out=32,
            fc1_out=64,
            fc2_out=64,
        )
        self.out = nn.Linear(64, action_dim)

    def forward(self, state):
        x = self.base(state)
        q_values = self.out(x)
        return q_values


# =============================================================================
# 6. CombinedActorCritic для A2C и PPO
# =============================================================================
class CombinedActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, history_len=16):
        super(CombinedActorCritic, self).__init__()
        self.base = CombinedNetwork(
            history_len=history_len,
            inv_pnl_dim=2,
            conv_out_channels=32,
            fc_inv_out=32,
            fc1_out=64,
            fc2_out=64,
        )
        self.actor = nn.Linear(64, action_dim)
        self.critic = nn.Linear(64, 1)

    def forward(self, state):
        x = self.base(state)
        logits = self.actor(x)
        value = self.critic(x)
        return logits, value


# =============================================================================
# 7. DQNAgent (дискретный режим)
# =============================================================================
Transition = namedtuple(
    "Transition", ["state", "action", "reward", "next_state", "done"]
)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


class DQNAgent:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=1e-3,
        gamma=0.99,
        buffer_capacity=10000,
        batch_size=64,
        target_update=1000,
        history_len=16,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = CombinedQNetwork(
            state_dim, action_dim, history_len=history_len
        ).to(self.device)
        self.target_network = CombinedQNetwork(
            state_dim, action_dim, history_len=history_len
        ).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.gamma = gamma
        self.buffer = ReplayBuffer(buffer_capacity)
        self.batch_size = batch_size
        self.target_update = target_update
        self.steps_done = 0
        self.action_dim = action_dim

    def select_action(self, state, epsilon):
        if random.random() < epsilon:
            return random.randrange(self.action_dim)
        else:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def push_transition(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)

    def update(self):
        if len(self.buffer) < self.batch_size:
            return
        transitions = self.buffer.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        state_batch = torch.FloatTensor(batch.state).to(self.device)
        action_batch = torch.LongTensor(batch.action).unsqueeze(1).to(self.device)
        reward_batch = torch.FloatTensor(batch.reward).unsqueeze(1).to(self.device)
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state],
            dtype=torch.bool,
            device=self.device,
        )
        non_final_next_states = torch.FloatTensor(
            [s for s in batch.next_state if s is not None]
        ).to(self.device)
        done_batch = torch.FloatTensor(batch.done).unsqueeze(1).to(self.device)
        q_values = self.q_network(state_batch).gather(1, action_batch)
        next_q_values = torch.zeros(self.batch_size, 1).to(self.device)
        if non_final_next_states.size(0) > 0:
            next_q_values[non_final_mask] = self.target_network(
                non_final_next_states
            ).max(1, keepdim=True)[0]
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values
        loss = nn.MSELoss()(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.steps_done += 1
        if self.steps_done % self.target_update == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())


# =============================================================================
# 8. A2CAgent (дискретный режим)
# =============================================================================
class A2CAgent:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=1e-3,
        gamma=0.99,
        value_coef=0.5,
        entropy_coef=0.01,
        history_len=16,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = CombinedActorCritic(
            state_dim, action_dim, history_len=history_len
        ).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

    def select_action(self, state):
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        logits, value = self.model(state_tensor)
        probs = torch.softmax(logits, dim=1)
        action_tensor = torch.multinomial(probs, num_samples=1)
        log_prob = torch.log(probs.gather(1, action_tensor) + 1e-10)
        return (
            action_tensor.item(),
            log_prob.squeeze(),
            value.squeeze(),
            logits.squeeze(),
        )

    def update(self, trajectories):
        log_probs = torch.stack([t[2] for t in trajectories]).to(self.device)
        values = torch.stack([t[3] for t in trajectories]).to(self.device)
        rewards = [t[4] for t in trajectories]
        dones = [t[5] for t in trajectories]
        logits_list = torch.stack([t[6] for t in trajectories]).to(self.device)
        R = 0
        returns = []
        for reward, done in zip(reversed(rewards), reversed(dones)):
            R = reward + self.gamma * R * (1 - done)
            returns.insert(0, R)
        returns = torch.FloatTensor(returns).to(self.device)
        advantages = returns - values
        actor_loss = -(log_probs * advantages.detach()).mean()
        critic_loss = advantages.pow(2).mean()
        probs = torch.softmax(logits_list, dim=1)
        entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=1).mean()
        loss = actor_loss + self.value_coef * critic_loss - self.entropy_coef * entropy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


# =============================================================================
# 9. PPOAgent (дискретный режим)
# =============================================================================
class PPOAgent:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=1e-3,
        gamma=0.99,
        clip_coef=0.2,
        value_coef=0.5,
        entropy_coef=0.01,
        ppo_epochs=4,
        history_len=16,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = CombinedActorCritic(
            state_dim, action_dim, history_len=history_len
        ).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.clip_coef = clip_coef
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.ppo_epochs = ppo_epochs

    def select_action(self, state):
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        logits, value = self.model(state_tensor)
        probs = torch.softmax(logits, dim=1)
        action_tensor = torch.multinomial(probs, num_samples=1)
        log_prob = torch.log(probs.gather(1, action_tensor) + 1e-10)
        return action_tensor.item(), log_prob.squeeze(), value.squeeze()

    def update(self, trajectories):
        states = torch.FloatTensor([t[0] for t in trajectories]).to(self.device)
        actions = (
            torch.LongTensor([t[1] for t in trajectories]).unsqueeze(1).to(self.device)
        )
        old_log_probs = torch.stack([t[2] for t in trajectories]).to(self.device)
        old_values = torch.stack([t[3] for t in trajectories]).to(self.device)
        rewards = [t[4] for t in trajectories]
        dones = [t[5] for t in trajectories]
        R = 0
        returns = []
        for reward, done in zip(reversed(rewards), reversed(dones)):
            R = reward + self.gamma * R * (1 - done)
            returns.insert(0, R)
        returns = torch.FloatTensor(returns).to(self.device)
        old_log_probs = old_log_probs.detach()
        old_values = old_values.detach()
        advantages = returns - old_values
        for _ in range(self.ppo_epochs):
            logits, values = self.model(states)
            values = values.squeeze(1)
            probs = torch.softmax(logits, dim=1)
            new_log_probs = torch.log(probs.gather(1, actions) + 1e-10).squeeze(1)
            ratio = torch.exp(new_log_probs - old_log_probs)
            adv_detached = advantages.detach()
            surr1 = ratio * adv_detached
            surr2 = (
                torch.clamp(ratio, 1.0 - self.clip_coef, 1.0 + self.clip_coef)
                * adv_detached
            )
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (returns - values).pow(2).mean()
            entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=1).mean()
            loss = (
                actor_loss + self.value_coef * critic_loss - self.entropy_coef * entropy
            )
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


# =============================================================================
# 10. DDPGAgent (непрерывный режим)
# =============================================================================
class CombinedActorDDPG(nn.Module):
    def __init__(self, state_dim, action_dim=3, history_len=16):
        super(CombinedActorDDPG, self).__init__()
        self.base = CombinedNetwork(
            history_len=history_len,
            inv_pnl_dim=2,
            conv_out_channels=32,
            fc_inv_out=32,
            fc1_out=64,
            fc2_out=64,
        )
        self.out = nn.Linear(64, action_dim)

    def forward(self, state):
        x = self.base(state)
        a = torch.tanh(self.out(x))
        # Масштабирование выхода в диапазоны: γ, κ, Δσ
        gamma = gamma_range[0] + (gamma_range[1] - gamma_range[0]) * ((a[:, 0] + 1) / 2)
        kappa = kappa_range[0] + (kappa_range[1] - kappa_range[0]) * ((a[:, 1] + 1) / 2)
        vol_corr = vol_corr_range[0] + (vol_corr_range[1] - vol_corr_range[0]) * (
            (a[:, 2] + 1) / 2
        )
        action_out = torch.stack([gamma, kappa, vol_corr], dim=1)
        return action_out


class CombinedCriticDDPG(nn.Module):
    def __init__(self, state_dim, action_dim=3, history_len=16):
        super(CombinedCriticDDPG, self).__init__()
        self.base = CombinedNetwork(
            history_len=history_len,
            inv_pnl_dim=2,
            conv_out_channels=32,
            fc_inv_out=32,
            fc1_out=64,
            fc2_out=64,
        )
        self.out = nn.Linear(64 + action_dim, 1)

    def forward(self, state, action):
        x = self.base(state)
        x = torch.cat([x, action], dim=1)
        q = self.out(x)
        return q


class DDPGAgent:
    def __init__(
        self,
        state_dim,
        action_dim=3,
        lr_actor=1e-3,
        lr_critic=1e-3,
        gamma=0.99,
        tau=0.005,
        buffer_capacity=100000,
        batch_size=64,
        history_len=16,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor = CombinedActorDDPG(state_dim, action_dim, history_len).to(
            self.device
        )
        self.critic = CombinedCriticDDPG(state_dim, action_dim, history_len).to(
            self.device
        )
        self.target_actor = CombinedActorDDPG(state_dim, action_dim, history_len).to(
            self.device
        )
        self.target_critic = CombinedCriticDDPG(state_dim, action_dim, history_len).to(
            self.device
        )
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.gamma = gamma
        self.tau = tau  # коэффициент для soft update
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_capacity)
        self.Experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"]
        )

    def select_action(self, state, noise_scale=0.1):
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state_tensor).cpu().data.numpy().flatten()
        self.actor.train()
        action += noise_scale * np.random.randn(len(action))
        # Обеспечиваем, чтобы действие оставалось в заданном диапазоне
        action[0] = np.clip(action[0], gamma_range[0], gamma_range[1])
        action[1] = np.clip(action[1], kappa_range[0], kappa_range[1])
        action[2] = np.clip(action[2], vol_corr_range[0], vol_corr_range[1])
        return action

    def push_experience(self, state, action, reward, next_state, done):
        e = self.Experience(state, action, reward, next_state, done)
        self.buffer.append(e)

    def sample_batch(self):
        batch = random.sample(self.buffer, self.batch_size)
        states = torch.FloatTensor([e.state for e in batch]).to(self.device)
        actions = torch.FloatTensor([e.action for e in batch]).to(self.device)
        rewards = (
            torch.FloatTensor([e.reward for e in batch]).unsqueeze(1).to(self.device)
        )
        # Если next_state равен None, заменяем его нулевым вектором той же размерности, что state
        next_states = torch.FloatTensor(
            [
                e.next_state if e.next_state is not None else np.zeros_like(e.state)
                for e in batch
            ]
        ).to(self.device)
        dones = torch.FloatTensor([e.done for e in batch]).unsqueeze(1).to(self.device)
        return states, actions, rewards, next_states, dones

    def update(self):
        if len(self.buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.sample_batch()
        with torch.no_grad():
            next_actions = self.target_actor(next_states)
            target_q = self.target_critic(next_states, next_actions)
            y = rewards + self.gamma * (1 - dones) * target_q
        current_q = self.critic(states, actions)
        critic_loss = nn.MSELoss()(current_q, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        pred_actions = self.actor(states)
        actor_loss = -self.critic(states, pred_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.actor, self.target_actor)
        self.soft_update(self.critic, self.target_critic)

    def soft_update(self, source, target):
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                self.tau * source_param.data + (1 - self.tau) * target_param.data
            )


# =============================================================================
# 11. Метрики: максимальная просадка и коэффициент Шарпа
# =============================================================================
def compute_max_drawdown(pnl_history):
    pnl_array = np.array(pnl_history)
    running_max = np.maximum.accumulate(pnl_array)
    drawdowns = pnl_array - running_max
    return drawdowns.min()


def compute_sharpe_ratio(pnl_history):
    pnl_array = np.array(pnl_history)
    returns = np.diff(pnl_array)
    if returns.std() == 0:
        return 0.0
    sharpe = returns.mean() / returns.std() * np.sqrt(len(returns))
    return sharpe


# =============================================================================
# 12. Параметры для дискретных агентов (DQN, A2C, PPO) и для DDPG
# =============================================================================
# Для дискретного режима:
gamma_values = [0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
kappa_values = [1e-4, 5e-4, 1e-3, 5e-3, 1e-2]
vol_corr_values = [-0.01, -0.005, 0.0, 0.005, 0.01]
# Создаем среду для дискретных агентов:
env = MarketMakingEnv(
    df=df,
    gamma_values=gamma_values,
    kappa_values=kappa_values,
    vol_corr_values=vol_corr_values,
    T=60.0,
    history_len=16,
    continuous=False,
)
state_dim = env.observation_space.shape[0]
if isinstance(env.action_space, spaces.MultiDiscrete):
    disc_action_dim = int(np.prod(env.action_space.nvec))
else:
    disc_action_dim = env.action_space.n
# Для DDPG – непрерывный режим:
env_cont = MarketMakingEnv(df=df, T=60.0, history_len=16, continuous=True)
cont_action_dim = 3

# =============================================================================
# 13. Инициализация агентов
# =============================================================================
dqn_agent = DQNAgent(
    state_dim,
    disc_action_dim,
    lr=1e-3,
    gamma=0.99,
    buffer_capacity=10000,
    batch_size=64,
    target_update=1000,
    history_len=16,
)
a2c_agent = A2CAgent(
    state_dim,
    disc_action_dim,
    lr=1e-3,
    gamma=0.99,
    value_coef=0.5,
    entropy_coef=0.01,
    history_len=16,
)
ppo_agent = PPOAgent(
    state_dim,
    disc_action_dim,
    lr=1e-3,
    gamma=0.99,
    clip_coef=0.2,
    value_coef=0.5,
    entropy_coef=0.01,
    ppo_epochs=4,
    history_len=16,
)
ddpg_agent = DDPGAgent(
    state_dim,
    action_dim=cont_action_dim,
    lr_actor=1e-3,
    lr_critic=1e-3,
    gamma=0.99,
    tau=0.005,
    buffer_capacity=100000,
    batch_size=64,
    history_len=16,
)

# =============================================================================
# 14. Параметры обучения
# =============================================================================
num_episodes = 20  # для DQN
num_episodes_a2c = 20  # для A2C
num_episodes_ppo = 20  # для PPO
num_episodes_ddpg = 20  # для DDPG
epsilon_start = 1.0
epsilon_final = 0.1
epsilon_decay = 300

# =============================================================================
# 15. Обучение дискретных агентов (DQN, A2C, PPO)
# =============================================================================
print("Обучение DQN-агента...")
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        epsilon = epsilon_final + (epsilon_start - epsilon_final) * math.exp(
            -env.t / epsilon_decay
        )
        action = dqn_agent.select_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        dqn_agent.push_transition(state, action, reward, next_state, done)
        dqn_agent.update()
        state = next_state if next_state is not None else state
        total_reward += reward
    print(
        f"DQN Эпизод {episode+1}/{num_episodes}: суммарное вознаграждение = {total_reward:.2f}"
    )

print("\nОбучение A2C-агента...")
rollout_length = 10
for episode in range(num_episodes_a2c):
    state = env.reset()
    trajectories = []
    total_reward = 0.0
    done = False
    while not done:
        for _ in range(rollout_length):
            action, log_prob, value, logits = a2c_agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectories.append(
                (state, action, log_prob, value, reward, float(done), logits)
            )
            total_reward += reward
            if done:
                break
            state = next_state
        a2c_agent.update(trajectories)
        trajectories = []
    print(
        f"A2C Эпизод {episode+1}/{num_episodes_a2c}: суммарное вознаграждение = {total_reward:.2f}"
    )

print("\nОбучение PPO-агента...")
rollout_length_ppo = 20
for episode in range(num_episodes_ppo):
    state = env.reset()
    trajectories = []
    total_reward = 0.0
    done = False
    while not done:
        for _ in range(rollout_length_ppo):
            action, log_prob, value = ppo_agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectories.append((state, action, log_prob, value, reward, float(done)))
            total_reward += reward
            if done:
                break
            state = next_state
        ppo_agent.update(trajectories)
        trajectories = []
    print(
        f"PPO Эпизод {episode+1}/{num_episodes_ppo}: суммарное вознаграждение = {total_reward:.2f}"
    )

# =============================================================================
# 16. Обучение DDPG-агента (непрерывный режим)
# =============================================================================
print("\nОбучение DDPG-агента...")
for episode in range(num_episodes_ddpg):
    state = env_cont.reset()
    total_reward = 0.0
    done = False
    while not done:
        action = ddpg_agent.select_action(state, noise_scale=0.1)
        next_state, reward, done, _ = env_cont.step(action)
        ddpg_agent.push_experience(state, action, reward, next_state, done)
        ddpg_agent.update()
        state = next_state if next_state is not None else state
        total_reward += reward
    print(
        f"DDPG Эпизод {episode+1}/{num_episodes_ddpg}: суммарное вознаграждение = {total_reward:.2f}"
    )


# =============================================================================
# 17. Тестирование агентов и вычисление метрик
# =============================================================================
def run_agent(env, agent, agent_type="dqn"):
    state = env.reset()
    pnl_history = []
    done = False
    while not done:
        if agent_type == "dqn":
            action = agent.select_action(state, epsilon=0.0)
        elif agent_type == "a2c":
            action, _, _, _ = agent.select_action(state)
        elif agent_type == "ppo":
            action, _, _ = agent.select_action(state)
        elif agent_type == "ddpg":
            action = ddpg_agent.select_action(state, noise_scale=0.0)
        else:
            action = 0
        next_state, _, done, _ = env.step(action)
        if state is not None:
            pnl_history.append(state[-1])
        state = next_state if next_state is not None else state
    return pnl_history


env_test_disc = MarketMakingEnv(
    df=df_test,
    gamma_values=gamma_values,
    kappa_values=kappa_values,
    vol_corr_values=vol_corr_values,
    T=60.0,
    history_len=16,
    continuous=False,
)
dqn_pnl = run_agent(env_test_disc, dqn_agent, agent_type="dqn")
a2c_pnl = run_agent(env_test_disc, a2c_agent, agent_type="a2c")
ppo_pnl = run_agent(env_test_disc, ppo_agent, agent_type="ppo")
env_test_cont = MarketMakingEnv(df=df_test, T=60.0, history_len=16, continuous=True)
ddpg_pnl = run_agent(env_test_cont, ddpg_agent, agent_type="ddpg")

dqn_max_dd = compute_max_drawdown(dqn_pnl)
dqn_sharpe = compute_sharpe_ratio(dqn_pnl)
a2c_max_dd = compute_max_drawdown(a2c_pnl)
a2c_sharpe = compute_sharpe_ratio(a2c_pnl)
ppo_max_dd = compute_max_drawdown(ppo_pnl)
ppo_sharpe = compute_sharpe_ratio(ppo_pnl)
ddpg_max_dd = compute_max_drawdown(ddpg_pnl)
ddpg_sharpe = compute_sharpe_ratio(ddpg_pnl)

print("\nРезультаты тестирования:")
print(
    f"DQN итоговый PnL: {dqn_pnl[-1]:.2f}, Max Drawdown: {dqn_max_dd:.2f}, Sharpe Ratio: {dqn_sharpe:.2f}"
)
print(
    f"A2C итоговый PnL: {a2c_pnl[-1]:.2f}, Max Drawdown: {a2c_max_dd:.2f}, Sharpe Ratio: {a2c_sharpe:.2f}"
)
print(
    f"PPO итоговый PnL: {ppo_pnl[-1]:.2f}, Max Drawdown: {ppo_max_dd:.2f}, Sharpe Ratio: {ppo_sharpe:.2f}"
)
print(
    f"DDPG итоговый PnL: {ddpg_pnl[-1]:.2f}, Max Drawdown: {ddpg_max_dd:.2f}, Sharpe Ratio: {ddpg_sharpe:.2f}"
)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Обучение DQN-агента...
DQN Эпизод 1/20: суммарное вознаграждение = -245.53
DQN Эпизод 2/20: суммарное вознаграждение = -1036.89
DQN Эпизод 3/20: суммарное вознаграждение = 73.37
DQN Эпизод 4/20: суммарное вознаграждение = 425.35
DQN Эпизод 5/20: суммарное вознаграждение = -498.92
DQN Эпизод 6/20: суммарное вознаграждение = 375.87
DQN Эпизод 7/20: суммарное вознаграждение = -302.34
DQN Эпизод 8/20: суммарное вознаграждение = -666.54
DQN Эпизод 9/20: суммарное вознаграждение = 535.77
DQN Эпизод 10/20: суммарное вознаграждение = 634.01
DQN Эпизод 11/20: суммарное вознаграждение = -337.20
DQN Эпизод 12/20: суммарное вознаграждение = 342.84
DQN Эпизод 13/20: суммарное вознаграждение = -182.06
DQN Эпизод 14/20: суммарное вознаграждение = -89.78
DQN Эпизод 15/20: суммарное вознаграждение = 321.39
DQN Эпизод 16/20: суммарное вознаграждение = 709.24
DQN Эпизод 17/20: суммарное вознаграждение = -457.70
DQN Эпизод 18/20: суммарное вознаграждение = 1110.99
DQN Эпизод 19/20: суммарное вознаграждени