# Deep Deterministic Policy Gradients (DDPG)

Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf)

## Objective Function and Updating
### 1. Actor
Maximize the expected return $J$ by ascending the sampled deterministic policy gradient:

$$\nabla_{\theta^{\mu}} J \approx \frac{1}{N} \sum_i{  \nabla_a   Q(s,a | \theta^{Q})   |_{s=s_i, a=\mu(s_i)}  \nabla_{\theta^{\mu}} \mu(s|\theta^{\mu})  |_{s_i}     }$$


### 2. Critic
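Minimize the mean-squared TD error against the target $y_i$ (note: the `fit` function below uses the smooth-L1 / Huber loss, `F.smooth_l1_loss`, in place of the plain MSE):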

$$y_i = r_i  + \gamma Q_{target}(s_{i+1},     \mu_{target}(s_{i+1}|\theta^{\mu_{target}})     | \theta^{Q_{target}})$$
$$L = \frac{1}{N}\sum_i{ (y_i - Q(s_i, a_i | {\theta}^Q))^2  }$$



### 3. Update the target networks: Soft Update
$$\theta^{Q_{target}} \leftarrow \tau \theta^Q + (1 - \tau)\theta^{Q_{target}}$$
$$\theta^{\mu_{target}} \leftarrow \tau \theta^{\mu} + (1 - \tau)\theta^{\mu_{target}}$$

where $\tau \ll 1$, for example $\tau = 0.005$ (the value used in this notebook).

## Points
+ Ornstein-Uhlenbeck Process
+ Target networks: soft update, applied right after every training step
+ Replay Buffer: store individual **transitions** $(s, a, r, s', \text{done mask})$ rather than one whole rollout sample

## Exploration
+ For **discrete** action spaces, exploration is done by probabilistically selecting a random action (such as $\epsilon$-greedy or Boltzmann exploration)
+ For **continuous** action spaces, exploration is done by adding noise to the action itself. In DDPG, the authors use an *Ornstein-Uhlenbeck process* to add noise to the action output

### Ornstein-Uhlenbeck Process
Generates noise that is correlated with the previous noise, so that consecutive perturbations do not cancel each other out or "freeze" the overall dynamics.

$$dx_t = \theta (\mu - x_t) dt + \sigma dW_t$$
where $W_t$ denotes the Wiener Process.


## Pendulum-v0
+ action: (1, ). a list of a single element which is in the range of `[-2, 2]`. The example is `[-1.5]`
+ state: (3, )


## 1. Import packages

In [0]:
import gym
import random
import numpy as np
import collections
import torch
import torch.nn.functional as F
import torch.nn as nn

## 2. Define constants

In [0]:
# Discount factor applied to the bootstrapped target Q-value in fit().
gamma = 0.99
# Number of iterations of the outer training loop (one rollout per iteration).
num_epochs = 3000
# Rewards are divided by this value in fit() before the TD target is built.
reward_div = 100
# Maximum number of transitions kept by the replay buffer (deque maxlen).
max_buffer = 50000
# Interpolation factor used by soft_update() for the target networks.
tau = 0.005
# Adam learning rates for the actor (Mu) and the critic (Q), respectively.
lr_mu = 0.0005
lr_q = 0.001
# Number of transitions drawn from the replay buffer per fit() call.
batch_size = 32

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 3. Prepare data

In [0]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of experience used for off-policy training.

    Each stored item is whatever the caller appends; in this notebook that is
    one transition tuple ``(s, a, r, s_prime, done_mask)``.  Once the buffer
    is full, appending a new item silently drops the oldest one.

    Args:
        maxlen: Maximum number of items to keep.  ``None`` (the default) falls
            back to the notebook-level ``max_buffer`` constant, so
            ``ReplayBuffer()`` behaves exactly as before.
    """

    def __init__(self, maxlen=None):
        if maxlen is None:
            # Resolved lazily so an explicit capacity never needs the global.
            maxlen = max_buffer
        self.buffer = collections.deque(maxlen=maxlen)

    def append(self, sample):
        """Add one item (a single transition) to the buffer."""
        self.buffer.append(sample)

    def sample(self, n):
        """Return ``n`` distinct items drawn uniformly at random.

        Raises:
            ValueError: If ``n`` is larger than the number of stored items.
        """
        return random.sample(self.buffer, n)

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.buffer)
      
      
# Module-level replay buffer: get_sample() appends transitions to it and
# fit() draws training mini-batches from it.
buffer = ReplayBuffer()

class OrnsteinUhlenbeckNoise(object):
    """Temporally correlated exploration noise (discretised OU process).

    Every call advances the process by one step of size ``dt``::

        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)

    so each sample depends on the previous one and drifts back towards ``mu``.

    Args:
        mu: Long-run mean of the process; its shape fixes the shape of every
            noise sample.  Scalars and sequences are converted to arrays.
        theta: Strength of the pull back towards ``mu``.
        dt: Time step of the discretisation.
        sigma: Scale of the random term.
    """

    def __init__(self, mu, theta=0.1, dt=0.01, sigma=0.1):
        self.theta, self.dt, self.sigma = theta, dt, sigma
        # Coerce so that ``.shape`` is available even for scalars / lists.
        self.mu = np.asarray(mu, dtype=float)
        self.x_prev = np.zeros_like(self.mu)

    def reset(self):
        """Restart the process from zero (e.g. at the start of an episode)."""
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process by one step and return the new noise sample."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        return x

# Exploration noise added to the actor's output inside get_sample().
ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1)) # initialize with zeros and same shape as actions


# Environment used for every rollout in the training loop.
env = gym.make("Pendulum-v0")


def get_sample(env, policy, buffer, max_iter=300):
    """Roll out one episode with exploration noise and record every transition.

    At each step the policy's action is perturbed with the module-level
    ``ou_noise`` and clipped to the environment's action bounds, so the action
    stored in the buffer is the one the environment was actually given.
    Each step appends ``(s, a, r, s_prime, done_mask)`` to ``buffer``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(s_prime, r, done, info)``, and ``action_space.low`` /
            ``action_space.high``.
        policy: Object with ``sample_action(state_tensor)`` returning a
            single-element tensor.
        buffer: Container with ``append``; receives one tuple per step.
        max_iter: Upper bound on the number of steps in the rollout.

    Returns:
        Tuple of float tensors on ``device`` for the whole rollout:
        ``(states, actions, rewards, next_states, done_masks)``.
    """
    # Only the first action component is used (single-element action).
    action_low = float(env.action_space.low[0])
    action_high = float(env.action_space.high[0])

    s = env.reset()  # (state_size, )

    ss, aa, rr, s_primes, done_masks = [], [], [], [], []
    for _step in range(max_iter):
        # Pure inference: no autograd graph is needed while collecting data.
        with torch.no_grad():
            a = policy.sample_action(torch.as_tensor(s, dtype=torch.float32, device=device))
        # OU noise depends only on its own previous value, not on the state.
        a = float(np.clip(a.item() + ou_noise()[0], action_low, action_high))
        s_prime, r, done, _info = env.step([a])  # continuous action, wrapped in a list

        # done_mask is 0.0 on the step that ends the episode, 1.0 otherwise.
        # NOTE(review): if `done` can also come from a time-limit wrapper,
        # this treats a truncation like a true terminal state - confirm.
        done_mask = 0.0 if done else 1.0

        ss.append(s)
        aa.append(a)
        rr.append(r)
        s_primes.append(s_prime)
        done_masks.append(done_mask)
        buffer.append((s, a, r, s_prime, done_mask))

        s = s_prime
        if done:
            break

    def to_tensor(values):
        # Stack through NumPy first: building a tensor from a Python list of
        # arrays converts element by element.
        return torch.as_tensor(np.asarray(values, dtype=np.float32), device=device)

    sample = (to_tensor(ss), to_tensor(aa), to_tensor(rr), to_tensor(s_primes), to_tensor(done_masks))
    return sample

## 4. Build model

In [0]:


class Mu(nn.Module):
    """Deterministic actor: maps a 3-dimensional state to a single action.

    The network is a 3 -> 128 -> 64 -> 1 MLP with ReLU activations.  The
    module also owns the Adam optimizer for its parameters (learning rate
    ``lr_mu``).
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(3, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.state2action = nn.Sequential(*layers)

        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=lr_mu, betas=(0.9, 0.99)
        )

    def sample_action(self, states):
        """Return the action for each state, squashed into ``[-2, 2]``.

        Args:
            states: Tensor whose last dimension is 3, e.g. ``(B, 3)``.

        Returns:
            Tensor with last dimension 1: ``tanh`` of the MLP output, times 2.
        """
        raw = self.state2action(states)
        squashed = torch.tanh(raw)
        return squashed * 2

class Q(nn.Module):
    """Critic: scores a (state, action) pair with a single scalar value.

    State and action are embedded separately (64 units each), concatenated,
    and passed through a 128 -> 32 -> 1 head.  The module also owns the Adam
    optimizer for its parameters (learning rate ``lr_q``).
    """

    def __init__(self):
        super().__init__()
        # Separate input branches for the state and the action.
        self.fc_s = nn.Linear(3, 64)
        self.fc_a = nn.Linear(1, 64)
        # Joint head on the concatenated embeddings.
        self.fc1 = nn.Linear(128, 32)
        self.fc2 = nn.Linear(32, 1)

        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=lr_q, betas=(0.9, 0.99)
        )

    def value(self, states, actions):
        """Return the estimated Q-value of each (state, action) pair.

        Args:
            states: ``(B, 3)`` tensor.
            actions: ``(B, 1)`` tensor.

        Returns:
            ``(B, 1)`` tensor of Q-values.
        """
        state_feat = F.relu(self.fc_s(states))    # (B, 64)
        action_feat = F.relu(self.fc_a(actions))  # (B, 64)
        joint = torch.cat((state_feat, action_feat), dim=1)  # (B, 128)
        hidden = F.relu(self.fc1(joint))  # (B, 32)
        return self.fc2(hidden)  # (B, 1)

def fit(mu, q, mu_target, q_target, buffer):
    """Run one DDPG update step on a mini-batch drawn from ``buffer``.

    The critic ``q`` is regressed (smooth-L1 loss) onto the TD target
    ``r / reward_div + gamma * Q_target(s', mu_target(s')) * done_mask``;
    the actor ``mu`` is then updated to maximise ``q(s, mu(s))``.
    The target networks are only read here, never updated.

    Args:
        mu: Actor with ``sample_action(states)`` and an ``optimizer``.
        q: Critic with ``value(states, actions)`` and an ``optimizer``.
        mu_target: Target actor used to pick the next action.
        q_target: Target critic used to evaluate the next state.
        buffer: Object whose ``sample(n)`` returns ``n`` transitions
            ``(s, a, r, s_prime, done_mask)``.
    """
    transitions = buffer.sample(batch_size)
    # Transpose the list of transitions into one sequence per field.
    ss, aa, rr, s_primes, done_masks = zip(*transitions)

    def to_tensor(values):
        # One NumPy stack per field instead of per-element conversion.
        return torch.as_tensor(np.asarray(values, dtype=np.float32), device=device)

    s = to_tensor(ss)
    a = to_tensor(aa).view(-1, 1)  # (B, 1)
    r = to_tensor(rr).view(-1, 1) / reward_div  # (B, 1), rescaled reward
    s_prime = to_tensor(s_primes)
    done_mask = to_tensor(done_masks).view(-1, 1)  # (B, 1)

    # --- critic update ---------------------------------------------------
    # The target is a constant for this step, so skip building its graph.
    with torch.no_grad():
        q_prime = q_target.value(s_prime, mu_target.sample_action(s_prime))  # (B, 1)
        td_target = r + gamma * q_prime * done_mask  # (B, 1)

    q_loss = F.smooth_l1_loss(q.value(s, a), td_target)
    q.optimizer.zero_grad()
    q_loss.backward()
    q.optimizer.step()

    # --- actor update ----------------------------------------------------
    # This backward pass also leaves gradients on q's parameters; they are
    # cleared by q.optimizer.zero_grad() before the next critic step.
    mu_loss = -q.value(s, mu.sample_action(s)).mean()
    mu.optimizer.zero_grad()
    mu_loss.backward()
    mu.optimizer.step()
    

def soft_update(model, model_target):
    """Blend ``model``'s parameters into ``model_target`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * online``,
    using the module-level ``tau``.  ``model`` itself is left unchanged.
    """
    pairs = zip(model.parameters(), model_target.parameters())
    for online, target in pairs:
        blended = target.data * (1.0 - tau) + online.data * tau
        target.data.copy_(blended)

## 5. Train

In [5]:
# Online networks and their target copies; the targets start out as exact
# clones of the online networks.
q, q_target = Q().to(device), Q().to(device)
mu, mu_target = Mu().to(device), Mu().to(device)
q_target.load_state_dict(q.state_dict())
mu_target.load_state_dict(mu.state_dict())


# Running total of episode returns, kept as a Python float so it neither
# stays on the device nor accumulates in float32 across thousands of epochs.
score = 0.0

for epoch in range(num_epochs):
    # ------------------------- Get sample ---------------------------------------------
    # One rollout with the current actor; its transitions land in `buffer`.
    sample = get_sample(env, mu, buffer)
    rewards = sample[2]
    score += rewards.sum().item()

    # ------------------------- Train on mini-batches drawn from the Replay Buffer ---------------------------------------------
    # Start learning only once the buffer holds more than 2000 transitions,
    # then run 10 gradient steps per rollout.
    if len(buffer) > 2000:
        for _ in range(10):
            fit(mu, q, mu_target, q_target, buffer)
            # Unlike DQN's periodic hard copy, the targets are nudged towards
            # the online networks right after every training step.
            soft_update(q, q_target)
            soft_update(mu, mu_target)


    if epoch % 20 == 0:
        # Reported value is the mean episode return over all epochs so far.
        print('Epoch %d || Average Score: %.6f'%(epoch, score / (epoch + 1)))

Epoch 0 || Average Score: -1819.201416
Epoch 20 || Average Score: -1476.492065
Epoch 40 || Average Score: -1392.920166
Epoch 60 || Average Score: -1378.597900
Epoch 80 || Average Score: -1360.710571
Epoch 100 || Average Score: -1371.217651
Epoch 120 || Average Score: -1389.895386
Epoch 140 || Average Score: -1382.050903
Epoch 160 || Average Score: -1375.786377
Epoch 180 || Average Score: -1352.145630
Epoch 200 || Average Score: -1339.502441
Epoch 220 || Average Score: -1336.122070
Epoch 240 || Average Score: -1314.615356
Epoch 260 || Average Score: -1301.329834
Epoch 280 || Average Score: -1294.296753
Epoch 300 || Average Score: -1301.388550
Epoch 320 || Average Score: -1308.272583
Epoch 340 || Average Score: -1317.923828
Epoch 360 || Average Score: -1288.647949
Epoch 380 || Average Score: -1256.099243
Epoch 400 || Average Score: -1230.564209
Epoch 420 || Average Score: -1214.916992
Epoch 440 || Average Score: -1193.397705
Epoch 460 || Average Score: -1178.266968
Epoch 480 || Average S