# Deep Deterministic Policy Gradient (DDPG)

We present Python code for Deep Deterministic Policy Gradient (DDPG). In methods that use a stochastic policy, the policy at a given state $s$ is a probability distribution $\pi(\cdot|s)$, and the algorithm samples one of the actions from that distribution. In contrast, a deterministic policy $\mu(s)$ maps the given state $s$ to one specific action, hence the name **deterministic**. The paper can be found [here](https://arxiv.org/abs/1509.02971), and the code is adapted from [this article](https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b).

In [None]:
import gym
import random
import numpy as np

import torch
import torch.autograd
import torch.nn            as nn
import torch.nn.functional as F 
import torch.optim         as optim

from collections import deque

# Actor and Critic Network (Separate)

For the DDPG algorithm, we use two separate networks: an Actor and a Critic.

## Critic 
The critic has four layers of units (an input layer, two hidden layers, and an output layer), connected by three linear maps. It learns the Q-function, $Q(s,a)$, so its input is the concatenation of the state and the action, and its output is a scalar.

## Actor
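The actor network also has four layers of units (an input layer, two hidden layers, and an output layer), connected by three linear maps. It takes the state vector as input and returns an action $a=\mu(s)$.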

In [None]:
class Critic(nn.Module):
    """Q-network: scores a (state, action) pair.

    The network is a multilayer perceptron made of three linear maps. The first
    two are followed by a ReLU; the last one is left linear so the output is
    unbounded.

    Args:
        input_size: Width of the concatenated state-action input.
        hidden_size: Number of units in each of the two hidden layers.
        output_size: Width of the network output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Input -> hidden, hidden -> hidden, hidden -> output.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, state, action):
        """Return the critic output for a batch of state-action pairs.

        Args:
            state: Tensor whose second dimension holds the state features.
            action: Tensor whose second dimension holds the action features.

        Returns:
            The output of the final linear layer.
        """
        # The critic sees state and action side by side along dimension 1.
        features = torch.cat((state, action), dim=1)

        # Both hidden layers use a ReLU non-linearity.
        for hidden_layer in (self.linear1, self.linear2):
            features = F.relu(hidden_layer(features))

        # Plain affine read-out (Ax + b), no activation.
        return self.linear3(features)

class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action.

    The network is a multilayer perceptron made of three linear maps. The first
    two are followed by a ReLU and the last one by ``tanh``, so every output
    component lies in [-1, +1].

    Args:
        input_size: Width of the state input.
        hidden_size: Number of units in each of the two hidden layers.
        output_size: Width of the action output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Input -> hidden, hidden -> hidden, hidden -> output.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, state):
        """Return the action for a batch of states.

        Args:
            state: Tensor whose last dimension holds the state features.

        Returns:
            Tensor of actions squashed into [-1, +1] by ``tanh``.
        """
        hidden = state

        # Both hidden layers use a ReLU non-linearity.
        for hidden_layer in (self.linear1, self.linear2):
            hidden = F.relu(hidden_layer(hidden))

        # tanh bounds the action between -1 and +1.
        return torch.tanh(self.linear3(hidden))

# Replay Buffer
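As in Deep Q-Network (DQN), we construct a replay buffer. As mentioned in [this paper](https://arxiv.org/abs/1509.02971), most optimization algorithms assume that the samples are independently and identically distributed. Transitions collected sequentially while exploring an environment are strongly correlated and therefore violate this assumption; storing them in a replay buffer and training on randomly sampled minibatches addresses that problem.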

In [None]:
class Memory:
    """Bounded replay buffer of (state, action, reward, next_state, done) tuples.

    The buffer is a ``deque`` with ``maxlen = max_size``, so once it is full each
    new transition pushes out the oldest one.
    """

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the right end of the buffer.

        The reward is wrapped in a one-element NumPy array before it is stored.
        """
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            A tuple of five lists ``(states, actions, rewards, next_states,
            dones)``; index ``i`` of every list belongs to the same transition.

        Raises:
            ValueError: Raised by ``random.sample`` when ``batch_size`` is larger
                than the number of stored transitions (or negative).
        """
        # Sampling is without replacement.
        transitions = random.sample(self.buffer, batch_size)

        # Transpose the list of 5-tuples into five parallel lists.
        state_batch = [transition[0] for transition in transitions]
        action_batch = [transition[1] for transition in transitions]
        reward_batch = [transition[2] for transition in transitions]
        next_state_batch = [transition[3] for transition in transitions]
        done_batch = [transition[4] for transition in transitions]

        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# The Ornstein-Uhlenbeck Process

The Ornstein-Uhlenbeck process generates noise that is correlated with the previous noise, so as to prevent the noise from canceling out or “freezing” the overall dynamics. Adding this noise is suggested in the [original paper](https://arxiv.org/abs/1509.02971). [Wikipedia](https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process) provides a thorough explanation of the Ornstein-Uhlenbeck process. The source code is adapted from [rlkit's OU exploration strategy](https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py). The Ornstein-Uhlenbeck process with an additional drift term can be described as:
$$
    dx_t = \theta (\mu - x_t)dt + \sigma d W_t
$$
where $W_t$ denotes the [Wiener process](https://en.wikipedia.org/wiki/Wiener_process). 


In [None]:

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise added to a policy's action.

    The internal state follows the discretised update
    ``x <- x + theta * (mu - x) + sigma * N(0, I)`` with a unit time step.

    Args:
        action_space: Object exposing ``shape``, ``low`` and ``high``; the first
            entry of ``shape`` is used as the noise dimension and ``low``/``high``
            as clipping bounds.
        mu: Value the noise state is pulled towards and reset to.
        theta: Strength of the pull towards ``mu``.
        max_sigma: Noise scale at ``t = 0``.
        min_sigma: Noise scale once ``t`` reaches ``decay_period``.
        decay_period: Number of steps over which sigma moves linearly from
            ``max_sigma`` to ``min_sigma``.
    """

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        # Process parameters.
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period

        # Dimension and bounds taken from the action space.
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high

        self.reset()

    def reset(self):
        """Set every component of the noise state back to ``mu``."""
        self.state = self.mu * np.ones(self.action_dim)

    def evolve_state(self):
        """Advance the noise state by one step and return the new state."""
        # Deterministic pull towards the mean.
        drift = self.theta * (self.mu - self.state)

        # Random kick drawn from the standard normal distribution.
        diffusion = self.sigma * np.random.randn(self.action_dim)

        # x_{n+1} = x_n + dx
        self.state = self.state + (drift + diffusion)
        return self.state

    def get_action(self, action, t=0):
        """Return ``action`` perturbed by OU noise and clipped to the bounds.

        The noise state is advanced first, using the sigma left by the previous
        call; sigma is then refreshed from ``t`` for the next call. With the
        default ``max_sigma == min_sigma`` sigma stays constant.
        """
        noise_sample = self.evolve_state()

        # Linear decay, saturating once t reaches decay_period.
        progress = min(1.0, t / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress

        noisy_action = action + noise_sample
        return np.clip(noisy_action, self.low, self.high)


class NormalizedEnv(gym.ActionWrapper):
    """
        Action wrapper that rescales the actor's output for Pendulum-v1.

        Pendulum-v1 accepts actions between -2 and +2, whereas the actor network ends in tanh
        and therefore produces values between -1 and +1. This wrapper converts from the
        [-1, +1] range to the [-2, +2] range before the action reaches the environment.

        [REF] https://github.com/openai/gym/blob/master/gym/envs/classic_control/pendulum.py
        [REF] https://github.com/openai/gym/blob/master/gym/core.py
        [REF] https://www.gymlibrary.ml/content/wrappers/
    """

    def action(self, act):
        """
            Map an action from [-1, +1] (the actor network's output) to [-2, +2] (the gym's input).

            The factor of 2 is specific to Pendulum; the wrapper is not valid for gym
            environments whose action range differs.
        """
        # Doubling stretches [-1, +1] onto [-2, +2].
        rescaled = 2 * act
        return rescaled
        

# DDPG Agent
Note that when we build the actor and critic networks, we also make a copy of each, called the actor-target and critic-target networks. The details are, again, explained in [this paper](https://arxiv.org/abs/1509.02971).

In [None]:

class DDPGagent(object):
    """Deep Deterministic Policy Gradient agent.

    Holds an actor, a critic, a target copy of each, a replay buffer, and one
    Adam optimizer per trained network.

    Args:
        env: Environment whose ``observation_space.shape[0]`` and
            ``action_space.shape[0]`` give the state and action dimensions.
        hidden_size: Width of the hidden layers of every network.
        actor_learning_rate: Adam learning rate for the actor.
        critic_learning_rate: Adam learning rate for the critic.
        gamma: Discount factor applied to the bootstrapped next-state value.
        tau: Interpolation weight of the soft target update
            (``target <- tau * online + (1 - tau) * target``).
        max_memory_size: Capacity of the replay buffer.
    """

    def __init__(self, env, hidden_size=256, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, max_memory_size=50000):

        # Params
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        # Actor Networks
        self.actor = Actor(self.num_states, hidden_size, self.num_actions)
        self.actor_target = Actor(self.num_states, hidden_size, self.num_actions)

        # Critic Networks. Q(s, a) is a scalar, so the critic has exactly one
        # output unit regardless of the action dimension; a wider output would
        # be broadcast against the (batch, 1) rewards in update().
        critic_input_size = self.num_states + self.num_actions
        self.critic = Critic(critic_input_size, hidden_size, 1)
        self.critic_target = Critic(critic_input_size, hidden_size, 1)

        self.gamma = gamma
        self.tau = tau

        # Targets start as exact copies of the online networks.
        self._copy_parameters(self.actor_target, self.actor)
        self._copy_parameters(self.critic_target, self.critic)

        # Construct the replay buffer holding at most max_memory_size transitions.
        self.memory = Memory(max_memory_size)

        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)

    @staticmethod
    def _copy_parameters(target_net, source_net):
        """Overwrite every parameter of ``target_net`` with ``source_net``'s."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(param)

    def _soft_update(self, target_net, source_net):
        """Move ``target_net`` a fraction ``tau`` of the way towards ``source_net``."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(param * self.tau + target_param * (1.0 - self.tau))

    def get_action(self, state):
        """Return the actor's action for a single state.

        Args:
            state: NumPy array holding one observation.

        Returns:
            The first component of the actor's output as a NumPy scalar.
        """
        state = torch.from_numpy(state).float().unsqueeze(0)

        # Acting does not need gradients.
        with torch.no_grad():
            action = self.actor(state)

        # NOTE(review): [0, 0] keeps only the first action component, which is
        # all there is for a one-dimensional action space. A multi-dimensional
        # action space would need [0] here; left unchanged so the return type
        # callers see stays the same.
        return action.numpy()[0, 0]

    def update(self, batch_size):
        """Run one critic step, one actor step, and a soft target update.

        Args:
            batch_size: Number of transitions to draw from the replay buffer.
        """
        states, actions, rewards, next_states, _ = self.memory.sample(batch_size)

        # Stack each batch into a single ndarray first: building a tensor
        # directly from a list of ndarrays is a much slower conversion path.
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.as_tensor(np.array(actions), dtype=torch.float32)
        rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)

        # Critic loss: regress Q(s, a) onto r + gamma * Q'(s', mu'(s')).
        Qvals = self.critic(states, actions)

        # The target is a constant for the regression, so it is built without a
        # graph; otherwise critic_loss.backward() would keep accumulating
        # never-zeroed gradients on the target critic's parameters.
        # NOTE(review): the sampled `done` flags are not used to mask the
        # bootstrap term. That looks intentional for an environment that only
        # ends by time limit, but confirm before reusing this agent on a task
        # with true terminal states.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_Q = self.critic_target(next_states, next_actions)
            Qprime = rewards + self.gamma * next_Q

        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Actor loss: ascend the critic's value of the actor's own actions.
        policy_loss = -self.critic(states, self.actor(states)).mean()

        # Update Actor network
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update Critic network. zero_grad() also discards the gradients that
        # policy_loss.backward() left on the critic's parameters.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update target networks
        self._soft_update(self.actor_target, self.actor)
        self._soft_update(self.critic_target, self.critic)


In [None]:
import sys
import gym
import numpy as np

import matplotlib.pyplot as plt


# Train a DDPG agent on Pendulum-v1 and plot the per-episode reward.
# The loop below uses the Gym API in which reset() returns only the
# observation and step() returns a 4-tuple (obs, reward, done, info).
env = NormalizedEnv(gym.make("Pendulum-v1"))

agent = DDPGagent(env)
noise = OUNoise(env.action_space)
batch_size = 128
rewards = []      # total reward of every finished episode
avg_rewards = []  # running mean over the last (up to) 10 episodes

for episode in range(50):
    state = env.reset()
    noise.reset()
    episode_reward = 0
    done = False

    for step in range(500):
        # Deterministic action from the actor, perturbed by OU noise.
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, new_state, done)

        # Start learning once the buffer holds more than one batch.
        if len(agent.memory) > batch_size:
            agent.update(batch_size)

        state = new_state
        episode_reward += reward

        if done:
            break

    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

    # Log after the bookkeeping above so the reported average includes the
    # episode that just finished and is never taken over an empty list
    # (which yielded nan and a RuntimeWarning on the first episode).
    if done:
        sys.stdout.write("episode: {}, reward: {}, average _reward: {} \n".format(episode, np.round(episode_reward, decimals=2), avg_rewards[-1]))

plt.plot(rewards)
plt.plot(avg_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()