# Actor-Critic Methods

#### Refs: 
- [Sharif University of Technology - Deep Reinforcement Learning (Fall 2024) - Dr. A. Emami and M. Narimani](https://github.com/mnarimani/DRL_Fall2024_SUT)

In [3]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.keras import initializers

from Lib.RBFFeature import FeatureTransformer
from Lib.ActorCritic import ActorNetwork

# Gymnasium environment id; every `gym.make(env_name, ...)` call below reads it.
env_name = 'Pendulum-v1'

In [4]:
class ValueModel(keras.Model):
    """State-value approximator made of a single linear dense unit.

    The model has no hidden layers: whatever feature vector it is given is
    mapped straight to one output through `Dense(1, activation='linear')`.
    """

    def __init__(self):
        super().__init__()
        # TODO: Add a kernel initializer for NN's initial weights
        self.output_layer = keras.layers.Dense(units=1, activation='linear')

    def call(self, inputs):
        """Return the dense layer's output with every size-1 axis squeezed away."""
        prediction = self.output_layer(inputs)
        return tf.squeeze(prediction)


In [5]:
def play_one_episode(env, feature_transformer, policy_model, value_model, gamma, max_steps):
    """Play one episode, updating the actor and the critic after every step.

    Each step forms a one-step TD target ``G = r + gamma * V(s')`` and uses
    ``G - V(s)`` as the advantage for the policy-gradient update; the critic is
    then regressed towards the same target.

    Args:
        env: Gymnasium-style environment (``reset() -> (obs, info)`` and
            ``step(a) -> (obs, reward, terminated, truncated, info)``).
        feature_transformer: object whose ``transform(batch)`` maps a batch of
            raw observations to the features fed to both models.
        policy_model: actor. Must provide ``sample_action(state)``, be callable
            as ``mean, stdv = policy_model(state)`` and be compiled with an
            optimizer.
        value_model: critic, callable on the same features and compiled with
            an optimizer.
        gamma: discount factor used in the TD target.
        max_steps: upper bound on the number of environment steps.

    Returns:
        ``(totalreward, mse_v, mae_pi)``: the undiscounted episode return, the
        value loss of the last step and the policy loss of the last step.
        Both losses are NaN when no step was taken (``max_steps <= 0``).
    """
    observation, _ = env.reset()
    done = False
    totalreward = 0
    iters = 0
    loss = value_loss = None

    while not done and iters < max_steps:
        state = feature_transformer.transform(np.array([observation]))
        action = policy_model.sample_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.numpy()[0])
        done = terminated or truncated

        totalreward += reward

        next_state = feature_transformer.transform(np.array([observation]))
        V_next = value_model(next_state)
        if terminated:
            # A genuinely terminal state has no future return, so do not
            # bootstrap from it. A time-limit truncation still bootstraps.
            V_next = tf.zeros_like(V_next)
        # Target and advantage are computed outside any tape, so both act as
        # constants in the two updates below.
        G = reward + gamma * V_next
        advantage = G - value_model(state)

        with tf.GradientTape() as tape:
            mean, stdv = policy_model(state)
            dist = tfp.distributions.Normal(mean, stdv)
            log_prob = dist.log_prob(action)
            # Policy-gradient objective plus a small entropy bonus (weight 0.01).
            loss = -tf.reduce_sum(advantage * log_prob + 0.01 * dist.entropy()) # TODO: try without including the regularization term
            tf.debugging.assert_all_finite(loss, "Loss is NaN/Inf")

        grads = tape.gradient(loss, policy_model.trainable_variables)
        policy_model.optimizer.apply_gradients(zip(grads, policy_model.trainable_variables))
        # Debug aid: report when the last trainable variable has become NaN.
        if np.isnan(policy_model.trainable_variables[-1].value.numpy()[0]):
            print(f"b_std = {policy_model.trainable_variables[-1].value.numpy()[0]}, grad = {grads[-1].numpy()}")

        with tf.GradientTape() as tape:
            value_pred = value_model(state)
            value_loss = tf.reduce_sum(tf.square(G - value_pred))

        grads = tape.gradient(value_loss, value_model.trainable_variables)
        value_model.optimizer.apply_gradients(zip(grads, value_model.trainable_variables))

        iters += 1

    if iters == 0:
        # No step was taken, so there are no losses to report.
        return totalreward, float('nan'), float('nan')

    mse_v = value_loss.numpy()
    mae_pi = loss.numpy()
    return totalreward, mse_v, mae_pi

In [6]:
def run_training(env, ft, num_episodes=150, lr_pi=1e-4, lr_val=1e-3, max_steps=200):
    """Train a fresh actor/critic pair and return the per-episode history.

    Args:
        env: environment handed to `ActorNetwork` and to `play_one_episode`.
        ft: feature transformer forwarded to `play_one_episode`.
        num_episodes: number of episodes to play.
        lr_pi: Adam learning rate for the policy model.
        lr_val: Adam learning rate for the value model.
        max_steps: per-episode step cap forwarded to `play_one_episode`.

    Returns:
        `(episode_rewards, episode_mse_v, episode_mae_pi, policy_model,
        value_model)`, where the first three are NumPy arrays with one entry
        per episode.
    """
    discount_rate = 0.96

    policy_model = ActorNetwork(env=env, hidden_layer_sizes=(), mean_layer_activation_function='linear')
    policy_model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_pi))

    value_model = ValueModel()
    value_model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_val))

    reward_log, value_loss_log, policy_loss_log = [], [], []

    for episode_number in range(1, num_episodes + 1):
        total_reward, mse_v, mae_pi = play_one_episode(
            env, ft, policy_model, value_model, discount_rate, max_steps)
        reward_log.append(total_reward)
        value_loss_log.append(mse_v)
        policy_loss_log.append(mae_pi)

        # Progress line every tenth episode; the average covers all episodes so far.
        if episode_number % 10 == 0:
            print(f"Episode: {episode_number:4d} | "
                  f"Score: {int(total_reward):5d} | "
                  f"Avg reward: {int(sum(reward_log)/len(reward_log)):5d} | "
                  f"Policy MAE: {mae_pi:.2f} | "
                  f"Value MSE: {mse_v:.2f}")

        # TODO: Add a convergence criteria

    return (np.array(reward_log),
            np.array(value_loss_log),
            np.array(policy_loss_log),
            policy_model,
            value_model)

In [None]:
# Experiment settings for the one-step actor-critic run
num_episodes = 150
max_steps = 200
num_components = 100
env = gym.make(env_name)

# One (num_components, gamma) pair per gamma value
# (presumably one RBF feature bank per kernel width -- TODO confirm in Lib.RBFFeature)
rbf_gammas = (5, 2, 1, 0.5)
feature_transformer = FeatureTransformer(
    env,
    components_gammas=tuple((num_components, g) for g in rbf_gammas),
    n_samples=10000,
)

# Note: both learning rates override the defaults declared by run_training
rewards, mae_v, mae_pi, policy_model, value_model = run_training(
    env=env,
    ft=feature_transformer,
    num_episodes=num_episodes,
    lr_pi=1e-3,
    lr_val=1e-1,
    max_steps=max_steps,
)

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(ft, agent):
    """Roll out `agent` in a fresh environment and save/display the video.

    Reads the module-level `env_name` and `max_steps`. Frames are written to
    ``Renders/AC_render_ENV-<env_name>.mp4`` at 30 fps and the file is then
    shown inline with IPython.

    Args:
        ft: feature transformer; `ft.transform` is applied to each observation.
        agent: policy exposing `sample_action(features)`.
    """
    import os

    video_path = f"Renders/AC_render_ENV-{env_name}.mp4"
    # The output directory may not exist yet; create it before opening the writer.
    os.makedirs(os.path.dirname(video_path), exist_ok=True)

    env = gym.make(env_name, render_mode='rgb_array')
    out = None
    try:
        state, _ = env.reset()
        # Render once and take both dimensions from the same frame.
        frame_height, frame_width = env.render().shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_width, frame_height))
        for _ in range(max_steps):
            features = ft.transform(np.array([state]))
            action = agent.sample_action(features)
            # Same action indexing as play_one_episode, so the environment
            # receives an action of the shape it was trained with.
            state, _, terminated, truncated, _ = env.step(action.numpy()[0])
            frame = env.render()
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV expects BGR
            out.write(frame_bgr)
            if terminated or truncated:
                # Do not keep stepping an environment that has ended.
                break
    finally:
        # Release the writer and the environment even if the rollout fails.
        if out is not None:
            out.release()
        env.close()
    display(Video(video_path))

render_pendulum(feature_transformer, policy_model)

## A3C : Asynchronous Advantage Actor Critic

In [7]:
import threading
from Lib.ActorCritic import ActorCritic
# Environment setup
# `env` is the instance handed to the global ActorCritic below; each worker
# creates its own environment in A3CWorker.__init__.
env = gym.make(env_name)
# First dimension of the observation / action spaces.
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.shape[0]
# [low, high] arrays of the action space.
action_bound = [env.action_space.low, env.action_space.high]

# Hyperparameters
# Number of A3CWorker instances (one thread each).
num_workers = 8 # cpu_count()
# Workers keep starting episodes while the shared counter is below this value.
num_episodes = 2000
# Maximum number of environment steps per episode.
num_timesteps = 200
# Name passed to the global ActorCritic instance.
global_net_scope = 'Global_Net'
# A worker pushes an update every `update_global` of its own steps (and at episode end).
update_global = 10
# Discount factor used when accumulating the n-step targets.
gamma = 0.90
# Not referenced in this cell's visible code; presumably the entropy weight
# read inside Lib.ActorCritic -- TODO confirm.
beta = 0.01

class A3CWorker:
    """A3C worker: owns an environment and a local `ActorCritic` and trains it.

    Several workers run `work()` concurrently in separate threads. They share
    the module-level `global_rewards` list and `global_episodes` counter, so
    updates to those go through `_stats_lock`.
    """

    # `global_episodes += 1` is a read-modify-write and is not atomic across
    # threads; this lock keeps the counter and the reward log consistent.
    _stats_lock = threading.Lock()

    def __init__(self, name, global_ac):
        """Create the worker's own environment and local actor-critic.

        Args:
            name: worker label, used for the local network and in log lines.
            global_ac: the shared `ActorCritic` passed on to the local one.
        """
        self.env = gym.make(env_name)
        self.name = name
        self.AC = ActorCritic(self.env, name, global_ac)

    @staticmethod
    def _discounted_returns(rewards, bootstrap_value, discount):
        """Return the discounted return for each step of a reward sequence.

        The last entry is ``rewards[-1] + discount * bootstrap_value`` and
        every earlier entry is its reward plus `discount` times the entry
        after it. The result has the same order and length as `rewards`.
        """
        returns = []
        running = bootstrap_value
        for step_reward in reversed(rewards):
            running = step_reward + discount * running
            returns.append(running)
        returns.reverse()
        return returns

    def work(self):
        """Play episodes and call `self.AC.update` until enough episodes are done.

        A batch of (state, action, n-step target) is sent to `self.AC.update`
        every `update_global` steps and at the end of each episode. The loop
        stops once the shared `global_episodes` counter reaches `num_episodes`.
        """
        global global_rewards, global_episodes
        total_step = 1

        while global_episodes < num_episodes:
            state, _ = self.env.reset()
            batch_states, batch_actions, batch_rewards = [], [], []
            Return = 0

            for t in range(num_timesteps):
                action = self.AC.get_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # End the episode at the step cap, and also when the
                # environment itself reports that the episode is over.
                done = terminated or truncated or t == num_timesteps - 1
                Return += reward

                batch_states.append(state)
                batch_actions.append(action)
                batch_rewards.append(reward)

                if total_step % update_global == 0 or done:
                    if done:
                        # End of episode (including the step cap) is treated
                        # as terminal: nothing is bootstrapped.
                        target = 0
                    else:
                        target = self.AC.critic(tf.convert_to_tensor([next_state], dtype=tf.float32))
                        target = target.numpy()[0, 0]

                    batch_target_value = self._discounted_returns(batch_rewards, target, gamma)

                    self.AC.update(np.vstack(batch_states),
                                   np.vstack(batch_actions),
                                   np.vstack(batch_target_value))

                    batch_states, batch_actions, batch_rewards = [], [], []

                state = next_state
                total_step += 1

                if done:
                    with self._stats_lock:
                        global_rewards.append(Return)
                        global_episodes += 1
                        finished_episodes = global_episodes
                    # Report this worker's own return, labelled with the
                    # number of episodes completed so far.
                    if finished_episodes % 10 == 0:
                        print(
                            self.name,
                            "Epispde:", finished_episodes,
                            "| Reward: %i" % Return
                            )
                    break

# Training setup: shared state that every worker thread reads and updates
global_rewards = []
global_episodes = 0

# Global network, plus one worker per thread (each worker builds its own
# environment and local network in its constructor)
global_ac = ActorCritic(env, global_net_scope)
workers = [A3CWorker(f'W_{idx}', global_ac) for idx in range(num_workers)]

# Launch one thread per worker
# (to debug single-threaded, call worker.work() directly instead)
worker_threads = []
for worker in workers:
    worker_threads.append(threading.Thread(target=worker.work))
    worker_threads[-1].start()

# Block until every worker has left its training loop
for thread in worker_threads:
    thread.join()

W_4 Epispde: 10 | Reward: -1386
W_3 Epispde: 20 | Reward: -1566
W_6 Epispde: 30 | Reward: -1322
W_0 Epispde: 40 | Reward: -975
W_4 Epispde: 50 | Reward: -1363
W_3 Epispde: 60 | Reward: -1507
W_6 Epispde: 70 | Reward: -1496
W_0 Epispde: 80 | Reward: -1451
W_4 Epispde: 90 | Reward: -1422
W_7 Epispde: 100 | Reward: -1244
W_6 Epispde: 110 | Reward: -1518
W_0 Epispde: 120 | Reward: -986
W_4 Epispde: 130 | Reward: -1317
W_7 Epispde: 140 | Reward: -1523
W_3 Epispde: 150 | Reward: -1253
W_5 Epispde: 160 | Reward: -1699
W_4 Epispde: 170 | Reward: -1427
W_7 Epispde: 180 | Reward: -1133
W_3 Epispde: 190 | Reward: -1269
W_5 Epispde: 200 | Reward: -1426
W_4 Epispde: 210 | Reward: -1426
W_7 Epispde: 220 | Reward: -1388
W_3 Epispde: 230 | Reward: -1087
W_5 Epispde: 240 | Reward: -1221
W_4 Epispde: 250 | Reward: -1448
W_7 Epispde: 260 | Reward: -1532
W_3 Epispde: 270 | Reward: -1280
W_5 Epispde: 280 | Reward: -1181
W_4 Epispde: 290 | Reward: -1494
W_7 Epispde: 300 | Reward: -1318
W_3 Epispde: 310 | Re

In [8]:
from IPython.display import Video, display
import cv2

def render_pendulum(policy_model):
    """Roll out `policy_model` in a fresh environment and save/display the video.

    Reads the module-level `env_name` and `num_timesteps`. Frames are written
    to ``Renders/A3C_render.mp4`` at 30 fps and the file is then shown inline
    with IPython.

    Args:
        policy_model: object exposing `get_action(state)`; the returned action
            is passed to `env.step` unchanged.
    """
    import os

    video_path = "Renders/A3C_render.mp4"
    # The output directory may not exist yet; create it before opening the writer.
    os.makedirs(os.path.dirname(video_path), exist_ok=True)

    env = gym.make(env_name, render_mode='rgb_array')
    out = None
    try:
        state, _ = env.reset()
        # Render once and take both dimensions from the same frame.
        frame_height, frame_width = env.render().shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_width, frame_height))
        for _ in range(num_timesteps):
            action = policy_model.get_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            frame = env.render()
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR for OpenCV
            out.write(frame_bgr)
            if terminated or truncated:
                # Do not keep stepping an environment that has ended.
                break
    finally:
        # Release the writer and the environment even if the rollout fails.
        if out is not None:
            out.release()
        env.close()
    display(Video(video_path))

render_pendulum(global_ac)

  from pkg_resources import resource_stream, resource_exists


## A2C : (Synchronous) Advantage Actor Critic

In [None]:
from queue import Queue
import threading
import gymnasium as gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from Lib.ActorCritic import ActorNetwork

# Gymnasium environment id used by this cell's `gym.make` calls
# (declared here alongside the cell's own imports).
env_name = 'Pendulum-v1'

from Lib.ActorCritic import ActorNetwork, CriticNetwork, GlobalAgent
# Environment setup
# `env` is the instance handed to GlobalAgent below; each A2CWorker also
# creates its own environment for stepping.
env = gym.make(env_name)
# First dimension of the observation / action spaces.
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.shape[0]
# [low, high] arrays of the action space; used to clip sampled actions.
action_bound = [env.action_space.low, env.action_space.high]

# Hyperparameters
# Number of A2CWorker threads; also the number of gradient sets gathered per global update.
num_workers = 8 # cpu_count()
# Episode budget: checked per worker in A2CWorker.work and in total by the coordinating loop.
num_episodes = 2000
# Maximum number of environment steps per episode.
num_timesteps = 200
# NOTE(review): `global_net_scope` and `update_global` are not referenced by
# the visible A2C code -- presumably carried over from the A3C cell.
global_net_scope = 'Global_Net'
update_global = 10
# Discount factor used when accumulating the episode's target values.
gamma = 0.90
# Weight of the entropy term in the actor loss.
beta = 0.01

class A2CWorker:
    """A2C worker: plays episodes with local networks and queues gradients.

    The worker never applies gradients itself. After every episode it puts
    ``(actor_grads, critic_grads)`` on `gradient_queue`; a coordinating loop
    is expected to apply them to `global_agent` and then call
    `sync_with_global()` on each worker.

    NOTE(review): `work()` does not wait for that sync before starting its
    next episode, so a worker may act with weights from before the latest
    global update -- confirm whether a barrier is wanted.
    """

    # Added to the actor's scale output so sampling and log-probabilities
    # never see a zero standard deviation.
    _STDEV_EPS = 1e-4

    def __init__(self, name, global_agent, gradient_queue):
        """Build the local environment and networks, then copy the global weights.

        Args:
            name: worker label used in log lines.
            global_agent: object exposing `actor` and `critic` networks whose
                weights are mirrored locally.
            gradient_queue: queue that receives this worker's gradients.
        """
        self.env = gym.make(env_name)
        self.name = name
        self.global_agent = global_agent
        self.gradient_queue = gradient_queue

        # Local networks, built from the worker's own environment.
        self.actor = ActorNetwork(self.env, (250, 250))
        self.critic = CriticNetwork(self.env)

        self.actor.build_networks()
        self.critic.build_networks()

        self.sync_with_global()

    def sync_with_global(self):
        """Synchronize local networks with global networks"""
        self.actor.set_weights(self.global_agent.actor.get_weights())
        self.critic.set_weights(self.global_agent.critic.get_weights())

    def _policy_params(self, states):
        """Return ``(mean, stdev)`` of the Gaussian policy for `states`.

        This is the single definition of the policy's parameters. Both action
        sampling and the log-probabilities in the loss use it, so gradients
        are taken for the same distribution that generated the actions.
        """
        mean, stdev = self.actor(states)
        return mean, stdev + self._STDEV_EPS

    def get_action(self, state):
        """Sample one action for a single observation, clipped to the action bounds."""
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        mean, stdev = self._policy_params(state)

        noise = tf.random.normal(shape=mean.shape)
        action = mean + stdev * noise
        action = tf.clip_by_value(action, action_bound[0], action_bound[1])

        return action[0]

    def compute_gradients(self, states, actions, target_values):
        """Return ``(actor_grads, critic_grads)`` for one batch.

        Args:
            states: batch of observations.
            actions: actions taken in those states.
            target_values: discounted return targets, one per state.

        Returns:
            Two lists of gradients, each tensor clipped to norm 40.
        """
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        target_values = tf.convert_to_tensor(target_values, dtype=tf.float32)

        with tf.GradientTape(persistent=True) as tape:
            # Actor forward pass, with the same parameterisation as get_action.
            mean, stdev = self._policy_params(states)
            dist = tfp.distributions.Normal(mean, stdev)
            log_prob = tf.reduce_sum(dist.log_prob(actions), axis=1, keepdims=True)
            entropy = tf.reduce_sum(dist.entropy(), axis=1, keepdims=True)

            # Critic forward pass
            values = self.critic(states)
            td_error = target_values - values

            # The advantage is held constant in the actor loss; `beta`
            # weights the entropy bonus.
            actor_loss = -tf.reduce_mean(log_prob * tf.stop_gradient(td_error) + beta * entropy)
            critic_loss = tf.reduce_mean(tf.square(td_error))

        # Compute gradients
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        # A persistent tape holds its resources until it is dropped.
        del tape

        # Clip gradients.
        # NOTE(review): dropping None entries shifts the remaining gradients
        # relative to `trainable_variables`; confirm what GlobalAgent.update
        # expects if a variable can ever be unused.
        actor_grads = [tf.clip_by_norm(grad, 40) for grad in actor_grads if grad is not None]
        critic_grads = [tf.clip_by_norm(grad, 40) for grad in critic_grads if grad is not None]

        return actor_grads, critic_grads

    @staticmethod
    def _discounted_returns(rewards, bootstrap_value, discount):
        """Return the discounted return for each step of a reward sequence.

        The last entry is ``rewards[-1] + discount * bootstrap_value`` and
        every earlier entry is its reward plus `discount` times the entry
        after it. The result has the same order and length as `rewards`.
        """
        returns = []
        running = bootstrap_value
        for step_reward in reversed(rewards):
            running = step_reward + discount * running
            returns.append(running)
        returns.reverse()
        return returns

    def work(self, coordinator):
        """Play episodes and queue one gradient set per finished episode.

        Runs until this worker has played `num_episodes` episodes or
        `coordinator.should_stop()` becomes true.
        """
        global global_rewards, global_episodes
        episode = 0

        while episode < num_episodes and not coordinator.should_stop():
            state, _ = self.env.reset()
            batch_states, batch_actions, batch_rewards = [], [], []
            episode_reward = 0

            for t in range(num_timesteps):
                action = self.get_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # End the episode at the step cap, and also when the
                # environment itself reports that the episode is over.
                done = terminated or truncated or t == num_timesteps - 1
                episode_reward += reward

                batch_states.append(state)
                batch_actions.append(action)
                batch_rewards.append(reward)

                if not done:
                    state = next_state
                    continue

                # Gradients are computed only at episode end, and the end of
                # an episode is treated as terminal, so nothing is bootstrapped.
                batch_target_value = self._discounted_returns(batch_rewards, 0, gamma)

                # Compute gradients and add to queue
                gradients = self.compute_gradients(
                    np.vstack(batch_states),
                    np.vstack(batch_actions),
                    np.vstack(batch_target_value)
                )
                self.gradient_queue.put(gradients)

                # Update episode stats
                global_rewards.append(episode_reward)
                episode += 1

                # Report this worker's own return, labelled with the number
                # of episodes it has completed.
                if episode % 10 == 0:
                    print(
                        self.name,
                        "Epispde:", episode,
                        "| Reward: %i" % episode_reward
                        )
                break
                
# Initialize global agent and gradient queue
global_agent = GlobalAgent(env, (250, 250))
gradient_queue = Queue()
global_rewards = []
global_episodes = 0

# Create coordinator for managing workers
coordinator = tf.train.Coordinator()

# Initialize workers
workers = [A2CWorker(f'W_{i}', global_agent, gradient_queue)
          for i in range(num_workers)]

# Create and start worker threads
# (to debug single-threaded, call worker.work(coordinator) directly instead)
worker_threads = []
for worker in workers:
    thread = threading.Thread(target=worker.work, args=(coordinator,))
    thread.start()
    worker_threads.append(thread)

try:
    while global_episodes < num_episodes:
        # Block until `num_workers` gradient sets have been submitted
        all_gradients = [gradient_queue.get() for _ in range(num_workers)]

        # Perform synchronous update
        global_agent.update(all_gradients)

        # Synchronize all workers with updated global networks
        for worker in workers:
            worker.sync_with_global()

        global_episodes += num_workers

except Exception as e:
    print(f"Training error: {e}")

finally:
    # Always stop and join the workers, including on KeyboardInterrupt (e.g. a
    # kernel interrupt), so no thread keeps training in the background.
    coordinator.request_stop()
    coordinator.join(worker_threads)

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(policy_model):
    """Roll out `policy_model` in a fresh environment and save/display the video.

    Reads the module-level `env_name` and `num_timesteps`. Frames are written
    to ``Renders/A2C_render.mp4`` at 30 fps and the file is then shown inline
    with IPython.

    Args:
        policy_model: object exposing `get_action(state)`; the returned action
            is passed to `env.step` unchanged.
    """
    import os

    video_path = "Renders/A2C_render.mp4"
    # The output directory may not exist yet; create it before opening the writer.
    os.makedirs(os.path.dirname(video_path), exist_ok=True)

    env = gym.make(env_name, render_mode='rgb_array')
    out = None
    try:
        state, _ = env.reset()
        # Render once and take both dimensions from the same frame.
        frame_height, frame_width = env.render().shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_width, frame_height))
        for _ in range(num_timesteps):
            action = policy_model.get_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            frame = env.render()
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR for OpenCV
            out.write(frame_bgr)
            if terminated or truncated:
                # Do not keep stepping an environment that has ended.
                break
    finally:
        # Release the writer and the environment even if the rollout fails.
        if out is not None:
            out.release()
        env.close()
    display(Video(video_path))

render_pendulum(workers[0])