# DDPG

Задаем структуру аппроксимаций $\pi^\eta(s)$, $Q^\theta(s,a)$ и начальные векторы параметров $\eta$, $\theta$.

Для каждого эпизода делаем:

   Пока эпизод не закончен, делаем:

- Находясь в состоянии $S_t$, совершаем действие

    $$
    A_t = \pi^\eta(S_t) + Noise,
    $$

    получаем награду $R_t$, признак завершения эпизода $D_t$ и переходим в состояние $S_{t+1}$. Сохраняем
    $(S_t,A_t,R_t,D_t,S_{t+1}) \Rightarrow Memory$


- Берем $\{(s_i,a_i,r_i,d_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$, определяем значения

    $$
    y_i = r_i + (1 - d_i) \gamma Q^\theta(s'_i,\pi^\eta(s'_i))
    $$
    функции потерь

    $$
    Loss_1(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2,\quad Loss_2(\eta) = -\frac{1}{n}\sum\limits_{i=1}^n Q^\theta(s_i,\pi^\eta(s_i))
    $$

    и обновляем векторы параметров

    $$
    \theta \leftarrow \theta - \alpha \nabla_\theta Loss_1(\theta),\quad \eta \leftarrow \eta - \beta \nabla_\eta Loss_2(\eta),\quad \alpha,\beta > 0
    $$

- Уменьшаем $Noise$


In [1]:
# Ornstein–Uhlenbeck process: mean-reverting noise used for exploration.

class OUNoise:
    """Discrete-time Ornstein–Uhlenbeck noise generator.

    Each call to ``sample`` moves the internal state towards ``mu`` by a
    fraction ``theta`` of the current distance and adds a Gaussian
    perturbation scaled by ``sigma``.

    Args:
        action_dimension: Length of the noise vector.
        mu: Value the process reverts to (also the initial state).
        theta: Strength of the pull towards ``mu``.
        sigma: Scale of the standard-normal perturbation.
    """

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # reset() creates ``self.state`` filled with ``mu``.
        self.reset()

    def reset(self):
        """Set every component of the state back to ``mu``."""
        self.state = np.ones(self.action_dimension) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state array."""
        previous = self.state
        pull = self.theta * (self.mu - previous)
        shock = self.sigma * np.random.randn(len(previous))
        self.state = previous + (pull + shock)
        return self.state

In [3]:
import numpy as np
import torch
import torch.nn as nn
import random
from collections import deque
from copy import deepcopy


class TwoLayersNeuralNetwork(nn.Module):
    """Fully connected network with two ReLU hidden layers.

    Args:
        input_dim: Size of the input feature vector.
        layer1_dim: Width of the first hidden layer.
        layer2_dim: Width of the second hidden layer.
        output_dim: Size of the output vector.
        output_tanh: When truthy, the output is passed through ``tanh``;
            otherwise the raw linear output is returned.
    """

    def __init__(self, input_dim, layer1_dim, layer2_dim, output_dim, output_tanh):
        super().__init__()
        # Linear layers are created in forward order.
        self.layer1 = nn.Linear(input_dim, layer1_dim)
        self.layer2 = nn.Linear(layer1_dim, layer2_dim)
        self.layer3 = nn.Linear(layer2_dim, output_dim)
        self.output_tanh = output_tanh
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, input):
        """Return the network output for ``input``."""
        features = self.relu(self.layer1(input))
        features = self.relu(self.layer2(features))
        raw_output = self.layer3(features)
        return self.tanh(raw_output) if self.output_tanh else raw_output
        
        
class DDPG():
    """Deep Deterministic Policy Gradient agent with target networks.

    The agent keeps an actor ``pi_model`` (tanh output, multiplied by
    ``action_scale``) and a critic ``q_model`` that takes the concatenated
    state and action, together with a slowly updated target copy of each.
    Transitions are stored in a bounded replay memory; every call to
    ``fit`` stores one transition and, once the memory holds more than
    ``batch_size`` items, performs one critic update and one actor update.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
        action_scale: Factor applied to the actor output; actions are
            clipped to ``[-action_scale, action_scale]``.
        noise_decrease: Amount subtracted from the exploration-noise
            multiplier on every ``fit`` call (floored at 0).
        gamma: Discount factor used in the critic target.
        batch_size: Number of transitions sampled per update.
        q_lr: Adam learning rate of the critic.
        pi_lr: Adam learning rate of the actor.
        tau: Soft-update coefficient of the target networks.
        memory_size: Maximum number of stored transitions.
    """

    def __init__(self, state_dim, action_dim, action_scale, noise_decrease,
                 gamma=0.99, batch_size=64, q_lr=1e-3, pi_lr=1e-4, tau=1e-2, memory_size=100000):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_scale = action_scale
        self.pi_model = TwoLayersNeuralNetwork(self.state_dim, 400, 300, self.action_dim, output_tanh=True)
        self.q_model = TwoLayersNeuralNetwork(self.state_dim + self.action_dim, 400, 300, 1, output_tanh=False)
        # Target networks start as exact copies of the online networks.
        self.pi_target_model = deepcopy(self.pi_model)
        self.q_target_model = deepcopy(self.q_model)
        self.noise = OUNoise(self.action_dim)
        # Multiplier of the exploration noise; starts at 1, reduced in fit().
        self.noise_threshold = 1
        self.noise_decrease = noise_decrease
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        # Attribute names (including their spelling) are kept for callers.
        self.q_optimazer = torch.optim.Adam(self.q_model.parameters(), lr=q_lr)
        self.pi_optimazer = torch.optim.Adam(self.pi_model.parameters(), lr=pi_lr)
        self.memory = deque(maxlen=memory_size)

    def get_action(self, state):
        """Return the noisy, scaled and clipped action for ``state``.

        The actor output plus ``noise_threshold`` times an OU-noise sample
        is multiplied by ``action_scale`` and clipped to
        ``[-action_scale, action_scale]``.
        """
        pred_action = self.pi_model(torch.FloatTensor(state)).detach().numpy()
        action = self.action_scale * (pred_action + self.noise_threshold * self.noise.sample())
        return np.clip(action, -self.action_scale, self.action_scale)

    def update_target_model(self, target_model, model, optimazer, loss):
        """Take one optimizer step on ``loss`` and softly update the target.

        After the gradient step each target parameter becomes
        ``(1 - tau) * target + tau * online``.
        """
        optimazer.zero_grad()
        loss.backward()
        optimazer.step()
        # The soft update is plain parameter arithmetic, so autograd is off.
        with torch.no_grad():
            for target_param, param in zip(target_model.parameters(), model.parameters()):
                target_param.copy_((1 - self.tau) * target_param + self.tau * param)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition, run one training step, and decay the noise.

        Training happens only when the memory holds more than
        ``batch_size`` transitions. The noise multiplier is decreased on
        every call regardless of whether a training step was performed.
        """
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            # Stack each column into one ndarray before creating the tensor;
            # building a tensor from a tuple of ndarrays is the slow path
            # that PyTorch warns about.
            states, actions, rewards, dones, next_states = (
                torch.as_tensor(np.asarray(column), dtype=torch.float32)
                for column in zip(*batch)
            )
            rewards = rewards.reshape(self.batch_size, 1)
            dones = dones.reshape(self.batch_size, 1)

            # Targets are constants for the critic loss, so no graph is
            # built through the target networks.
            with torch.no_grad():
                pred_next_actions = self.action_scale * self.pi_target_model(next_states)
                next_states_and_pred_next_actions = torch.cat((next_states, pred_next_actions), dim=1)
                targets = rewards + self.gamma * (1 - dones) * self.q_target_model(next_states_and_pred_next_actions)

            # Critic update: mean squared error against the targets.
            states_and_actions = torch.cat((states, actions), dim=1)
            q_loss = torch.mean((targets - self.q_model(states_and_actions)) ** 2)
            self.update_target_model(self.q_target_model, self.q_model, self.q_optimazer, q_loss)

            # Actor update: maximise the critic's value of the actor's
            # actions. Only pi_optimazer steps here; gradients left on the
            # critic are cleared by q_optimazer.zero_grad() on the next call.
            pred_actions = self.action_scale * self.pi_model(states)
            states_and_pred_actions = torch.cat((states, pred_actions), dim=1)
            pi_loss = - torch.mean(self.q_model(states_and_pred_actions))
            self.update_target_model(self.pi_target_model, self.pi_model, self.pi_optimazer, pi_loss)

        if self.noise_threshold > 0:
            self.noise_threshold = max(0, self.noise_threshold - self.noise_decrease)

In [4]:
import gym

# Training budget: number of episodes and maximum steps per episode.
episode_n = 200
trajectory_len = 200

env = gym.make('Pendulum-v1')
# noise_decrease is chosen so the exploration-noise multiplier goes from 1
# to 0 over episode_n * trajectory_len calls to agent.fit.
agent = DDPG(state_dim=3, action_dim=1, action_scale=2, noise_decrease=1 / (episode_n * trajectory_len))

for episode in range(episode_n):

    total_reward = 0
    # NOTE(review): this loop uses the gym API in which reset() returns only
    # the observation and step() returns four values — confirm against the
    # installed gym version.
    state = env.reset()
    for _step in range(trajectory_len):
        action = agent.get_action(state)
        # The first value returned by step() is the next observation.
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        agent.fit(state, action, reward, done, next_state)

        if done:
            break

        state = next_state

    print(f'episode={episode}, total_reward={total_reward}')

  deprecation(
  deprecation(
  states, actions, rewards, dones, next_states = map(torch.FloatTensor, zip(*batch))


episode=0, total_reward=-1334.8015564398079
episode=1, total_reward=-1525.8355889636084
episode=2, total_reward=-1646.2047885160314
episode=3, total_reward=-1570.9778542346894
episode=4, total_reward=-1109.3856326426808
episode=5, total_reward=-1307.9026756903793
episode=6, total_reward=-1232.0521245354862
episode=7, total_reward=-1163.0404773523933
episode=8, total_reward=-1042.2851789260446
episode=9, total_reward=-920.4526214979396
episode=10, total_reward=-467.1849950729043
episode=11, total_reward=-654.7507837418733
episode=12, total_reward=-921.0138331281235
episode=13, total_reward=-253.64803793336628
episode=14, total_reward=-665.215991697096
episode=15, total_reward=-515.7694358932431
episode=16, total_reward=-517.7784551260695
episode=17, total_reward=-636.1508766375658
episode=18, total_reward=-973.3759937808969
episode=19, total_reward=-254.72980598933873
episode=20, total_reward=-240.9448408417896
episode=21, total_reward=-497.75228509988625
episode=22, total_reward=-423.4

episode=183, total_reward=-127.69234814060512
episode=184, total_reward=-132.2390588999736
episode=185, total_reward=-250.32010098059897
episode=186, total_reward=-331.081512141525
episode=187, total_reward=-294.3771293976741
episode=188, total_reward=-7.186292460677919
episode=189, total_reward=-6.910015969813687
episode=190, total_reward=-127.77939049614344
episode=191, total_reward=-131.00873419896453
episode=192, total_reward=-6.887756208577514
episode=193, total_reward=-242.7954928830705
episode=194, total_reward=-125.16738509460522
episode=195, total_reward=-247.03015600011346
episode=196, total_reward=-133.06972137111734
episode=197, total_reward=-7.525640842115034
episode=198, total_reward=-309.01432746184423
episode=199, total_reward=-124.90500236960108
