# Deterministic Policy Gradient Methods

#### Refs: 
- [Sharif University of Technology - Deep Reinforcement Learning (Fall 2024) - Dr.A.Emami and M.Narimani](https://github.com/mnarimani/DRL_Fall2024_SUT)

In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf

from Lib.ActorCritic import ActorNetwork, CriticNetwork
from Lib.ReplayBuffer import ReplayBuffer

# Gymnasium environment ID passed to gym.make() by the training and rendering cells.
env_name = 'Pendulum-v1'

### Deep Deterministic Policy Gradient (DDPG)

In [None]:
class DDPG:
    """Deep Deterministic Policy Gradient (DDPG) agent.

    Holds an actor, a critic, their target copies, two Adam optimizers and a
    replay buffer, and exposes `train` to run the interaction/learning loop.

    Args:
        env: Gymnasium-style environment with `observation_space.shape`,
            `action_space.shape`, `action_space.high`, `reset()` and `step()`.
        hidden_sizes: Passed through to `ActorNetwork` / `CriticNetwork`.
        start_steps: Number of initial environment steps taken with uniformly
            sampled random actions before the actor is used. Stored as an int.
        replay_size: Capacity handed to `ReplayBuffer`.
        batch_size: Number of transitions sampled per gradient update.
        gamma: Discount factor used in the TD target.
        decay: Polyak coefficient; targets become
            `decay * target + (1 - decay) * main` after every update.
        mu_lr: Learning rate of the actor optimizer.
        q_lr: Learning rate of the critic optimizer.
        action_noise: Std-dev multiplier of the Gaussian exploration noise.
        max_episode_length: Hard cap on steps per training episode.
    """

    def __init__(
        self,
        env,
        hidden_sizes=(300,),
        start_steps=1e4,
        replay_size=int(1e4),
        batch_size=100,
        gamma=0.99,
        decay=0.995,
        mu_lr=1e-3,
        q_lr=1e-3,
        action_noise=0.1,
        max_episode_length=200
    ):
        self.env = env
        self.gamma = gamma
        self.decay = decay
        self.batch_size = batch_size
        self.action_noise = action_noise
        # The default is written as the float 1e4; store an int so step-count
        # comparisons and the progress messages use a whole number.
        self.start_steps = int(start_steps)
        self.max_episode_length = max_episode_length

        # Extract environment dimensions
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        # NOTE(review): only the first component of `high` is used and actions
        # are clipped to [-action_max, action_max] — assumes a symmetric,
        # uniform action bound; confirm for environments other than Pendulum.
        self.action_max = self.env.action_space.high[0]

        # Create networks.
        # NOTE(review): the meaning of the third ActorNetwork argument (True)
        # is defined in Lib.ActorCritic, which is not visible here.
        self.actor = ActorNetwork(env, hidden_sizes, True)
        self.critic = CriticNetwork(env, hidden_sizes)
        self.target_actor = ActorNetwork(env, hidden_sizes, True)
        self.target_critic = CriticNetwork(env, hidden_sizes)

        # Build networks (initialize weights) by calling each one once on
        # dummy inputs, so that get_weights()/set_weights() below have
        # variables to copy.
        dummy_state = tf.zeros([1, self.num_states])
        dummy_action = tf.zeros([1, self.num_actions])
        self.actor(dummy_state)
        x = tf.concat([dummy_state, dummy_action], axis=-1)
        self.critic(x)
        self.target_actor(dummy_state)
        self.target_critic(x)

        # Copy weights to target networks so targets start identical to mains
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        # Create optimizers
        self.actor_optimizer = tf.keras.optimizers.Adam(mu_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(q_lr)

        # Create replay buffer
        self.replay_buffer = ReplayBuffer(self.num_states, self.num_actions, replay_size)

    def get_action(self, s, noise_scale):
        """Return the actor's action for state `s` plus Gaussian noise.

        Args:
            s: Single observation as a NumPy array (reshaped to one row).
            noise_scale: Multiplier on standard-normal noise; 0 gives the
                deterministic policy action.

        Returns:
            NumPy array of length `num_actions`, clipped to
            [-action_max, action_max].
        """
        a = self.actor(tf.convert_to_tensor(s.reshape(1, -1), dtype=tf.float32))
        a = a.numpy()[0]  # Drop the batch dimension
        a += noise_scale * np.random.randn(self.num_actions)
        return np.clip(a, -self.action_max, self.action_max)

    @tf.function
    def update(self, batch):
        """Run one critic step and one actor step on a sampled batch.

        Args:
            batch: Mapping with keys 's', 's2', 'a', 'r', 'd' (states, next
                states, actions, rewards, done flags), each convertible to a
                float32 tensor.

        Returns:
            Tuple `(critic_loss, actor_loss)` of scalar tensors.
        """
        states = tf.convert_to_tensor(batch['s'], dtype=tf.float32)
        states_next = tf.convert_to_tensor(batch['s2'], dtype=tf.float32)
        actions = tf.convert_to_tensor(batch['a'], dtype=tf.float32)
        rewards = tf.convert_to_tensor(batch['r'], dtype=tf.float32)
        dones = tf.convert_to_tensor(batch['d'], dtype=tf.float32)

        # TD target from the target networks. It does not depend on the main
        # critic's variables, so it is computed outside the gradient tape and
        # explicitly treated as a constant.
        target_actions = self.target_actor(states_next)
        target_q = self.target_critic(tf.concat([states_next, target_actions], axis=-1))
        # NOTE(review): assumes rewards/dones have the same shape as the
        # critic output so this is elementwise (no unintended broadcasting) —
        # confirm against ReplayBuffer.sample_batch and CriticNetwork.
        q_target = tf.stop_gradient(rewards + self.gamma * (1 - dones) * target_q)

        # Update critic: mean squared TD error
        with tf.GradientTape() as tape:
            q = self.critic(tf.concat([states, actions], axis=-1))
            critic_loss = tf.reduce_mean((q - q_target)**2)

        critic_gradients = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_gradients, self.critic.trainable_variables)
        )

        # Update actor: maximize Q(s, actor(s)) by minimizing its negative
        with tf.GradientTape() as tape:
            actor_actions = self.actor(states)
            x = tf.concat([states, actor_actions], axis=-1)
            actor_loss = -tf.reduce_mean(self.critic(x))

        # Only the actor's variables are differentiated, so the critic is
        # left untouched by this step.
        actor_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables)
        )

        return critic_loss, actor_loss

    def update_target_networks(self):
        """Soft-update both target networks toward their main networks."""
        for target, main in zip(self.target_actor.variables, self.actor.variables):
            target.assign(self.decay * target + (1 - self.decay) * main)

        for target, main in zip(self.target_critic.variables, self.critic.variables):
            target.assign(self.decay * target + (1 - self.decay) * main)

    def train(self, num_episodes):
        """Interact with the environment and learn for `num_episodes` episodes.

        Args:
            num_episodes: Number of episodes to run.

        Returns:
            Tuple `(returns, critic_losses, actor_losses)`: per-episode
            undiscounted returns, and the critic / actor loss of every
            gradient update.
        """
        returns = []
        critic_losses = []
        actor_losses = []
        num_steps = 0
        # Latest losses for the progress line; NaN until the first update so
        # logging never touches an unbound name.
        last_critic_loss = float('nan')
        last_actor_loss = float('nan')

        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_return, episode_length = 0, 0
            terminated = truncated = False

            # Stop on a true terminal state, on the environment's own
            # truncation, or on this agent's episode-length cap.
            while not (terminated or truncated
                       or episode_length == self.max_episode_length):
                # Exactly `start_steps` random actions, then the actor.
                if num_steps >= self.start_steps:
                    action = self.get_action(state, self.action_noise)
                else:
                    action = self.env.action_space.sample()

                num_steps += 1
                if num_steps == 1:
                    print(f"Using random actions for the initial {self.start_steps} steps...")
                if num_steps == self.start_steps:
                    print(f"{self.start_steps} steps reached. Using agent actions from now on.")

                # Take action in environment
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                episode_return += reward
                episode_length += 1

                # Store transition. Only a true termination is stored as
                # done; time-limit cut-offs must still bootstrap from s2.
                self.replay_buffer.store(state, action, reward, next_state, terminated)

                # Update state
                state = next_state

                # Update networks once enough transitions are available
                if num_steps > self.batch_size and self.replay_buffer.size >= self.batch_size:
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    critic_loss, actor_loss = self.update(batch)
                    last_critic_loss = critic_loss.numpy()
                    last_actor_loss = actor_loss.numpy()
                    critic_losses.append(last_critic_loss)
                    actor_losses.append(last_actor_loss)
                    self.update_target_networks()

            if (episode + 1) % 10 == 0:
                print(f"Episode: {episode + 1:4d} | "
                      f"Score: {int(episode_return):5d} | "
                      f"Actor Loss: {last_actor_loss:.2f} | "
                      f"Critic Loss: {last_critic_loss:.2f}")
            returns.append(episode_return)

        return returns, critic_losses, actor_losses

In [None]:
# Create the environment named by env_name, build a DDPG agent (all other
# hyperparameters at their defaults) and train it for 200 episodes.
# train() returns the per-episode returns and the per-update critic/actor losses.
env = gym.make(env_name)
agent = DDPG(env, gamma=0.99)
returns, critic_losses, actor_losses = agent.train(num_episodes=200)

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(agent, video_path="Renders/DDPG_render.mp4", num_timesteps=200, fps=30):
    """Roll out `agent` without exploration noise, save it as a video and display it.

    A fresh environment named by the global `env_name` is created in
    `rgb_array` render mode. One frame is written per environment step.

    Args:
        agent: Object exposing `get_action(state, noise_scale)`.
        video_path: Destination file for the encoded video.
        num_timesteps: Maximum number of environment steps to record.
        fps: Frame rate passed to the video writer.

    Raises:
        RuntimeError: If the video writer could not be opened for `video_path`.
    """
    import os

    env = gym.make(env_name, render_mode='rgb_array')
    out = None
    try:
        state, _ = env.reset()

        # One render call is enough to learn the frame geometry.
        frame_height, frame_width = env.render().shape[:2]

        # Make sure the output directory exists before opening the writer.
        out_dir = os.path.dirname(video_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            # Without this check a missing codec or bad path fails silently
            # and the Video widget below points at a file that was never written.
            raise RuntimeError(f"Could not open video writer for {video_path!r}")

        for _ in range(num_timesteps):
            action = agent.get_action(state, 0)
            state, _, terminated, truncated, _ = env.step(action)
            frame_bgr = cv2.cvtColor(env.render(), cv2.COLOR_RGB2BGR)  # OpenCV expects BGR
            out.write(frame_bgr)
            # Do not keep stepping an episode the environment has ended.
            if terminated or truncated:
                break
    finally:
        # Release the writer and the environment even if the rollout fails.
        if out is not None:
            out.release()
        env.close()

    display(Video(video_path))

# Record and display a rollout of the agent trained in the previous cell.
render_pendulum(agent)

### Twin Delayed DDPG (TD3)

In [None]:
from Lib.ReplayBuffer import ReplayBuffer3

# Environment ID for the TD3 cells; this rebinds the global env_name.
# Alternative environment: env_name = 'Pendulum-v1'
env_name = 'MountainCarContinuous-v0'

class TD3:
    """Twin Delayed DDPG (TD3) agent.

    Holds one actor, two critics, target copies of all three, their Adam
    optimizers and a replay buffer. Compared with plain DDPG the update uses
    clipped noise on target actions, the minimum of two target critics, and
    actor/target updates only every `policy_delay` critic updates.

    Args:
        env: Gymnasium-style environment with `observation_space.shape`,
            `action_space.shape`, `action_space.high`, `reset()` and `step()`.
        hidden_sizes: Passed through to `ActorNetwork` / `CriticNetwork`.
        replay_size: Capacity handed to `ReplayBuffer3`. Random actions are
            used until the buffer holds this many transitions.
        gamma: Discount factor used in the TD target.
        decay: Polyak coefficient; targets become
            `decay * target + (1 - decay) * main`.
        mu_lr: Learning rate of the actor optimizer.
        q_lr: Learning rate of both critic optimizers.
        batch_size: Number of transitions sampled per gradient update.
        action_noise: Std-dev multiplier of the Gaussian exploration noise.
        target_noise: Std-dev of the noise added to target actions.
        noise_clip: Absolute clip applied to that target noise.
        policy_delay: Actor and targets are updated when
            `total_it % policy_delay == 0`.
        max_episode_length: Hard cap on steps per training episode.
    """

    def __init__(
        self,
        env,
        hidden_sizes=(300,),
        replay_size=int(1e4),
        gamma=0.99,
        decay=0.995,
        mu_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        action_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_episode_length=200
    ):
        self.env = env
        self.gamma = gamma
        self.decay = decay
        self.batch_size = batch_size
        self.action_noise = action_noise
        self.target_noise = target_noise                # TD3-specific: noise added to target actions
        self.noise_clip = noise_clip                    # TD3-specific: clipping of target noise
        self.policy_delay = policy_delay                # TD3-specific: delayed policy updates
        self.max_episode_length = max_episode_length

        # Extract environment dimensions
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        # NOTE(review): only the first component of `high` is used and actions
        # are clipped to [-action_max, action_max] — assumes a symmetric,
        # uniform action bound; confirm for the chosen environment.
        self.action_max = self.env.action_space.high[0]

        # Create networks.
        # NOTE(review): the meaning of the third ActorNetwork argument (True)
        # is defined in Lib.ActorCritic, which is not visible here.
        self.actor = ActorNetwork(env, hidden_sizes, True)
        self.critic1 = CriticNetwork(env, hidden_sizes)
        self.critic2 = CriticNetwork(env, hidden_sizes)
        self.target_actor = ActorNetwork(env, hidden_sizes, True)
        self.target_critic1 = CriticNetwork(env, hidden_sizes)
        self.target_critic2 = CriticNetwork(env, hidden_sizes)

        # Build networks (initialize weights) by calling each one once on
        # dummy inputs, so that get_weights()/set_weights() below have
        # variables to copy.
        dummy_state = tf.zeros([1, self.num_states])
        dummy_action = tf.zeros([1, self.num_actions])
        self.actor(dummy_state)
        x = tf.concat([dummy_state, dummy_action], axis=-1)
        self.critic1(x)
        self.critic2(x)
        self.target_actor(dummy_state)
        self.target_critic1(x)
        self.target_critic2(x)

        # Copy weights to target networks so targets start identical to mains
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic1.set_weights(self.critic1.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        # Create optimizers
        self.actor_optimizer = tf.keras.optimizers.Adam(mu_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(q_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(q_lr)

        # Create replay buffer
        self.replay_buffer = ReplayBuffer3(replay_size)

        # Count of completed updates; drives the delayed policy updates
        self.total_it = 0

    def get_action(self, s, noise_scale):
        """Return the actor's action for state `s` plus Gaussian noise.

        Args:
            s: Single observation as a NumPy array (reshaped to one row).
            noise_scale: Multiplier on standard-normal noise; 0 gives the
                deterministic policy action.

        Returns:
            NumPy array of length `num_actions`, clipped to
            [-action_max, action_max].
        """
        a = self.actor(tf.convert_to_tensor(s.reshape(1, -1), dtype=tf.float32))
        a = a.numpy()[0]  # Drop the batch dimension
        a += noise_scale * np.random.randn(self.num_actions)
        return np.clip(a, -self.action_max, self.action_max)

    # Deliberately left undecorated (the original had `@tf.function` commented
    # out) — presumably because the delayed-update branch below tests the
    # Python counter `self.total_it`, which must be re-read on every call.
    def update(self, batch):
        """Update both critics and, on delayed steps, the actor and targets.

        Args:
            batch: Mapping with keys 's', 's2', 'a', 'r', 'd' (states, next
                states, actions, rewards, done flags), each convertible to a
                float32 tensor.

        Returns:
            Tuple `(critic1_loss, critic2_loss, actor_loss)` of scalar
            tensors; `actor_loss` is the constant 0.0 on steps where the
            actor is not updated.
        """
        states = tf.convert_to_tensor(batch['s'], dtype=tf.float32)
        states_next = tf.convert_to_tensor(batch['s2'], dtype=tf.float32)
        actions = tf.convert_to_tensor(batch['a'], dtype=tf.float32)
        rewards = tf.convert_to_tensor(batch['r'], dtype=tf.float32)
        dones = tf.convert_to_tensor(batch['d'], dtype=tf.float32)

        # Target policy smoothing: clipped Gaussian noise on target actions
        noise = tf.random.normal(tf.shape(actions), stddev=self.target_noise)
        noise = tf.clip_by_value(noise, -self.noise_clip, self.noise_clip)

        target_actions = self.target_actor(states_next)
        target_actions = tf.clip_by_value(
            target_actions + noise,
            -self.action_max,
            self.action_max
        )

        # Clipped double-Q: take the smaller of the two target critics
        next_inputs = tf.concat([states_next, target_actions], axis=-1)
        target_q1 = self.target_critic1(next_inputs)
        target_q2 = self.target_critic2(next_inputs)
        target_q = tf.minimum(target_q1, target_q2)

        # Q targets (computed outside any tape, so treated as constants).
        # NOTE(review): assumes rewards/dones have the same shape as the
        # critic output so this is elementwise (no unintended broadcasting) —
        # confirm against ReplayBuffer3.sample_batch and CriticNetwork.
        q_target = rewards + self.gamma * (1 - dones) * target_q

        # Both critics are evaluated on the same (state, action) input
        critic_inputs = tf.concat([states, actions], axis=-1)

        # Update first critic
        with tf.GradientTape() as tape:
            q1 = self.critic1(critic_inputs)
            critic1_loss = tf.reduce_mean((q1 - q_target)**2)

        critic1_gradients = tape.gradient(critic1_loss, self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.critic1.trainable_variables)
        )

        # Update second critic
        with tf.GradientTape() as tape:
            q2 = self.critic2(critic_inputs)
            critic2_loss = tf.reduce_mean((q2 - q_target)**2)

        critic2_gradients = tape.gradient(critic2_loss, self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.critic2.trainable_variables)
        )

        # Delayed policy updates
        if self.total_it % self.policy_delay == 0:
            # Update actor against the first critic only
            with tf.GradientTape() as tape:
                actor_actions = self.actor(states)
                x = tf.concat([states, actor_actions], axis=-1)
                actor_loss = -tf.reduce_mean(self.critic1(x))

            actor_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables)
            )

            # Targets move only when the actor does
            self.update_target_networks()
        else:
            actor_loss = tf.constant(0.0)

        return critic1_loss, critic2_loss, actor_loss

    def update_target_networks(self):
        """Soft-update the target actor and both target critics."""
        for target, main in zip(self.target_actor.variables, self.actor.variables):
            target.assign(self.decay * target + (1 - self.decay) * main)

        for target, main in zip(self.target_critic1.variables, self.critic1.variables):
            target.assign(self.decay * target + (1 - self.decay) * main)

        for target, main in zip(self.target_critic2.variables, self.critic2.variables):
            target.assign(self.decay * target + (1 - self.decay) * main)

    def train(self, num_episodes):
        """Interact with the environment and learn for `num_episodes` episodes.

        Args:
            num_episodes: Number of episodes to run.

        Returns:
            Tuple `(returns, critic1_losses, critic2_losses, actor_losses)`:
            per-episode undiscounted returns, and the three losses of every
            call to `update`.
        """
        returns = []
        critic1_losses = []
        critic2_losses = []
        actor_losses = []
        # Latest losses for the progress line; NaN until the first update so
        # logging never touches an unbound name.
        last_critic1_loss = float('nan')
        last_critic2_loss = float('nan')
        last_actor_loss = float('nan')
        # Ensures the "memory full" notice is printed once, when the buffer
        # actually reaches capacity.
        announced_full = False

        print(f"Using random actions for the initial {self.replay_buffer.max_size} steps...")

        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_return, episode_length = 0, 0
            terminated = truncated = False

            # Stop on a true terminal state, on the environment's own
            # truncation, or on this agent's episode-length cap.
            while not (terminated or truncated
                       or episode_length == self.max_episode_length):
                # Use the actor only once the replay buffer is full
                if len(self.replay_buffer) >= self.replay_buffer.max_size:
                    action = self.get_action(state, self.action_noise)
                else:
                    action = self.env.action_space.sample()

                # Take action in environment
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                episode_return += reward
                episode_length += 1

                # Store transition. Only a true termination is stored as
                # done; time-limit cut-offs must still bootstrap from s2.
                self.replay_buffer.store(state, action, reward, next_state, terminated)

                if not announced_full and len(self.replay_buffer) >= self.replay_buffer.max_size:
                    print("Memory full. Performing agent actions from now on.")
                    announced_full = True

                # Update state
                state = next_state

                # Update networks if buffer has enough samples
                if len(self.replay_buffer) >= self.batch_size:
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    critic1_loss, critic2_loss, actor_loss = self.update(batch)
                    last_critic1_loss = critic1_loss.numpy()
                    last_critic2_loss = critic2_loss.numpy()
                    last_actor_loss = actor_loss.numpy()
                    critic1_losses.append(last_critic1_loss)
                    critic2_losses.append(last_critic2_loss)
                    actor_losses.append(last_actor_loss)
                    self.total_it += 1

            if (episode + 1) % 10 == 0:
                print(f"Episode: {episode + 1:4d} | "
                      f"Score: {int(episode_return):5d} | "
                      f"Memory: {len(self.replay_buffer):5d} | "
                      f"Actor Loss: {last_actor_loss:.2f} | "
                      f"Critic 1 Loss: {last_critic1_loss:.2f} | "
                      f"Critic 2 Loss: {last_critic2_loss:.2f}")

            returns.append(episode_return)

        return returns, critic1_losses, critic2_losses, actor_losses

In [None]:
# Create the environment named by env_name, build a TD3 agent with the
# TD3-specific settings spelled out explicitly, and train it for 200 episodes.
# train() returns the per-episode returns and the per-update critic/actor losses.
env = gym.make(env_name)
agent = TD3(env,gamma=0.99,policy_delay=2,target_noise=0.2,noise_clip=0.5)
returns, critic1_losses, critic2_losses, actor_losses = agent.train(num_episodes=200)

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(agent, video_path="Renders/TD3_render.mp4", num_timesteps=200, fps=30):
    """Roll out `agent` without exploration noise, save it as a video and display it.

    A fresh environment named by the global `env_name` is created in
    `rgb_array` render mode. One frame is written per environment step, and
    recording stops early if the environment ends the episode.

    Args:
        agent: Object exposing `get_action(state, noise_scale)`.
        video_path: Destination file for the encoded video.
        num_timesteps: Maximum number of environment steps to record.
        fps: Frame rate passed to the video writer.

    Raises:
        RuntimeError: If the video writer could not be opened for `video_path`.
    """
    import os

    env = gym.make(env_name, render_mode='rgb_array')
    out = None
    try:
        state, _ = env.reset()

        # One render call is enough to learn the frame geometry.
        frame_height, frame_width = env.render().shape[:2]

        # Make sure the output directory exists before opening the writer.
        out_dir = os.path.dirname(video_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            # Without this check a missing codec or bad path fails silently
            # and the Video widget below points at a file that was never written.
            raise RuntimeError(f"Could not open video writer for {video_path!r}")

        for _ in range(num_timesteps):
            action = agent.get_action(state, 0)
            state, _, terminated, truncated, _ = env.step(action)
            frame_bgr = cv2.cvtColor(env.render(), cv2.COLOR_RGB2BGR)  # Convert RGB to BGR for OpenCV
            out.write(frame_bgr)
            # Do not keep stepping an episode the environment has ended.
            if terminated or truncated:
                break
    finally:
        # Release the writer and the environment even if the rollout fails.
        if out is not None:
            out.release()
        env.close()

    display(Video(video_path))

# Record and display a rollout of the agent trained in the previous cell.
render_pendulum(agent)