# Soft Policy Gradient Methods

#### Refs: 
- [Sharif University of Technology - Deep Reinforcement Learning (Fall 2024) - Dr.A.Emami and M.Narimani](https://github.com/mnarimani/DRL_Fall2024_SUT)

In [1]:
import tensorflow as tf
from keras.optimizers import Adam
from tensorflow import keras
import numpy as np
import gymnasium as gym


from Lib.ReplayBuffer import ReplayBuffer2
from Lib.ActorCritic import ActorNetwork, CriticNetwork

# Gymnasium environment id passed to every gym.make(...) call in this notebook.
env_name = 'Pendulum-v1'






In [None]:
class SACAgent:
    """Soft Actor-Critic agent with twin Q-critics and a state-value network.

    The agent owns a replay buffer, a stochastic actor, two Q-critics, a value
    network and a slowly-updated target copy of the value network.

    The entropy temperature is held fixed at its initial value (0.2):
    ``log_alpha``, ``alpha_optimizer`` and ``target_entropy`` are created here
    but ``learn`` never updates them.

    NOTE(review): the output shapes of the ``Lib`` networks are not visible
    from this file. ``learn`` flattens every per-sample scalar (values,
    Q-values, log-probs, rewards, done flags) to shape ``(batch,)`` before
    combining them, which assumes each network emits one scalar per sample --
    confirm against ``Lib.ActorCritic``.

    Args:
        alpha: Learning rate of the actor optimizer.
        beta: Learning rate of the critic and value optimizers.
        input_dims: Observation shape handed to the replay buffer
            (defaults to ``[8]``).
        env: Environment handed to the network constructors.
        gamma: Discount factor.
        n_actions: Number of action components.
        max_size: Replay buffer capacity.
        tau: Interpolation factor for the target value network update.
        layer1_size: Width of the first hidden layer of the critics/value nets.
        layer2_size: Width of the second hidden layer of the critics/value nets.
        batch_size: Number of transitions sampled per ``learn`` call.
        reward_scale: Multiplier applied to rewards in the Q-target.
    """

    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=None,
            env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
            layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        # Avoid a shared mutable default; [8] is the historical default value.
        if input_dims is None:
            input_dims = [8]

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer2(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        # Temperature parameter and its optimizer. The temperature is not
        # trained by learn(); these are kept so automatic tuning can be added.
        self.log_alpha = tf.Variable(tf.math.log(0.2), dtype=tf.float32)
        self.alpha_T = tf.exp(self.log_alpha)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
        self.target_entropy = -tf.cast(self.n_actions, dtype=tf.float32)

        # NOTE(review): the actor uses 250 units per layer while the
        # layer*_size defaults are 256 -- confirm whether this is intentional.
        self.actor = ActorNetwork(env, (250, 250), name='actor')
        hidden_sizes = (layer1_size, layer2_size)
        self.critic_1 = CriticNetwork(env, hidden_sizes, name='critic_1')
        self.critic_2 = CriticNetwork(env, hidden_sizes, name='critic_2')
        self.value = CriticNetwork(env, hidden_sizes, name='value')
        self.target_value = CriticNetwork(env, hidden_sizes, name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        # tau=1 copies the value network into the target network outright.
        self.update_network_parameters(tau=1)

    @staticmethod
    def _flat(tensor):
        """Reshape a tensor to rank 1 so per-sample scalars align element-wise."""
        return tf.reshape(tensor, [-1])

    def choose_action(self, observation):
        """Sample an action from the actor for a single observation.

        Returns the first (only) row of the sampled action batch.
        """
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        """Blend the value network into the target value network.

        Each target weight becomes ``tau * value + (1 - tau) * target``.
        When ``tau`` is None the agent's configured ``self.tau`` is used.
        """
        if tau is None:
            tau = self.tau

        blended = [
            online * tau + target * (1 - tau)
            for online, target in zip(self.value.weights,
                                      self.target_value.weights)
        ]
        self.target_value.set_weights(blended)

    def learn(self):
        """Run one update of the value network, the actor and both critics.

        Returns:
            ``(value_loss, actor_loss, critic_1_loss, critic_2_loss)``, or
            ``(0, 0, 0, 0)`` while the buffer holds fewer than ``batch_size``
            transitions.
        """
        if self.memory.m_cntr < self.batch_size:
            return 0, 0, 0, 0

        state, action, reward, new_state, done = \
                self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)
        # Flatten rewards / done flags so they combine element-wise with the
        # flattened network outputs instead of broadcasting to (batch, batch).
        rewards = self._flat(tf.convert_to_tensor(reward, dtype=tf.float32))
        not_done = 1.0 - self._flat(tf.cast(done, tf.float32))

        # ---- value network update -------------------------------------
        # The target is built outside the tape: only self.value is trained here.
        value_ = self._flat(self.target_value(states_))
        current_policy_actions, log_probs = self.actor.sample_normal(
            states, reparameterize=False)
        log_probs = self._flat(log_probs)
        x = tf.concat([states, current_policy_actions], axis=-1)
        critic_value = self._flat(
            tf.math.minimum(self.critic_1(x), self.critic_2(x)))
        value_target = critic_value - self.alpha_T * log_probs

        with tf.GradientTape() as tape:
            value = self._flat(self.value(states))
            value_loss = 0.5 * tf.reduce_mean(tf.square(value - value_target))

        value_network_gradient = tape.gradient(value_loss,
                                               self.value.trainable_variables)
        self.value.optimizer.apply_gradients(zip(
            value_network_gradient, self.value.trainable_variables))

        # ---- actor update ---------------------------------------------
        with tf.GradientTape() as tape:
            # Reparameterized sample so gradients flow through the actions.
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = self._flat(log_probs)
            x = tf.concat([states, new_policy_actions], axis=-1)
            critic_value = self._flat(
                tf.math.minimum(self.critic_1(x), self.critic_2(x)))
            actor_loss = tf.math.reduce_mean(
                self.alpha_T * log_probs - critic_value)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(
            actor_network_gradient, self.actor.trainable_variables))

        # ---- critic update --------------------------------------------
        # Bootstrapped target; (1 - done) removes the bootstrap on terminal steps.
        q_hat = self.scale * rewards + self.gamma * value_ * not_done

        with tf.GradientTape(persistent=True) as tape:
            x = tf.concat([states, actions], axis=-1)
            q1_old_policy = self._flat(self.critic_1(x))
            q2_old_policy = self._flat(self.critic_2(x))
            critic_1_loss = 0.5 * tf.reduce_mean(
                tf.square(q1_old_policy - q_hat))
            critic_2_loss = 0.5 * tf.reduce_mean(
                tf.square(q2_old_policy - q_hat))

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)
        del tape  # release resources held by the persistent tape

        self.critic_1.optimizer.apply_gradients(zip(
            critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(zip(
            critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()

        return value_loss, actor_loss, critic_1_loss, critic_2_loss

In [None]:
# Train the SAC agent on the environment and record per-episode statistics.
env = gym.make(env_name)
agent = SACAgent(input_dims=env.observation_space.shape, env=env,
              n_actions=env.action_space.shape[0])
n_episodes = 150
max_steps = 200  # hard cap on steps per episode

score_history = []
value_losses = []
actor_losses = []
critic_1_losses = []
critic_2_losses = []

for i in range(n_episodes):
    observation, _ = env.reset()
    done = False
    score = 0
    steps = 0
    while not done and steps < max_steps:
        steps += 1
        action = agent.choose_action(observation)

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # action output is in [-1,1], hence the factor 2 for this environment.
        observation_, reward, terminated, truncated, _ = env.step(action.numpy() * 2)
        done = terminated or truncated
        score += reward
        # Only true termination is stored: a time-limit truncation must not
        # remove the bootstrap term from the Q-target.
        agent.remember(observation, action, reward, observation_, terminated)
        value_loss, actor_loss, critic_1_loss, critic_2_loss = agent.learn()
        observation = observation_
    score_history.append(score)

    # Reduce the last step's losses to plain floats so they can be stored and
    # formatted whether learn() returned ints, scalars or tensors.
    value_loss, actor_loss, critic_1_loss, critic_2_loss = (
        float(np.mean(np.asarray(loss)))
        for loss in (value_loss, actor_loss, critic_1_loss, critic_2_loss)
    )

    value_losses.append(value_loss)
    actor_losses.append(actor_loss)
    critic_1_losses.append(critic_1_loss)
    critic_2_losses.append(critic_2_loss)

    avg_score = np.mean(score_history[-100:])

    if (i+1) % 5 == 0:
        print(f"Episode: {i+1:4d} | "
              f"Score: {int(score):5d} | "
              f"Avg Score: {int(avg_score):5d} | "
              f"Actor Loss: {actor_loss:.2f} | "
              f"Critic 1 Loss: {critic_1_loss:.2f} | "
              f"Critic 2 Loss: {critic_2_loss:.2f} | "
              f"Value Loss: {value_loss:.2f}")

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(agent, video_path="Renders/SAC_render.mp4",
                    num_timesteps=200, action_scale=2.0):
    """Roll out ``agent`` in the environment and save the frames as an MP4.

    Args:
        agent: Object exposing ``choose_action(state)``.
        video_path: Destination file; its parent directory is created if missing.
        num_timesteps: Maximum number of environment steps to record.
        action_scale: Factor applied to the agent's action before stepping,
            matching the ``* 2`` used in the SAC training loop.

    Raises:
        RuntimeError: If OpenCV cannot open the video writer.
    """
    from pathlib import Path

    Path(video_path).parent.mkdir(parents=True, exist_ok=True)
    # Let the time limit match the requested rollout length.
    env = gym.make(env_name, render_mode='rgb_array',
                   max_episode_steps=num_timesteps)
    out = None
    try:
        state, _ = env.reset()
        frame_height, frame_width = env.render().shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_width, frame_height))
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for {video_path}")

        for _step in range(num_timesteps):
            # Convert to a NumPy array and apply the same scaling as training.
            action = np.asarray(agent.choose_action(state), dtype=np.float32) * action_scale
            state, _reward, terminated, truncated, _info = env.step(action)
            frame = env.render()
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
            if terminated or truncated:
                break
    finally:
        # Always release the writer and the environment, even on failure.
        if out is not None:
            out.release()
        env.close()
    display(Video(video_path))

render_pendulum(agent)

### PPO: Proximal Policy Optimization (PPO-Clip)

In [2]:
import tensorflow_probability as tfp

# Environment and hyperparameters shared by the PPO networks and training loop.
env = gym.make(env_name)
S_DIM = env.observation_space.shape[0]  # length of the observation vector
A_DIM = env.action_space.shape[0]  # length of the action vector
A_BOUND = [env.action_space.low[0], env.action_space.high[0]]  # [min, max] of the first action component
EP_MAX = 2000  # maximum number of training episodes
EP_LEN = 200  # environment steps per episode
GAMMA = 0.9  # discount factor used for the bootstrapped returns
A_LR = 0.0001  # learning rate of the actor's Adam optimizer
C_LR = 0.0005  # learning rate of the critic's Adam optimizer
BATCH = 64  # number of steps collected between PPO updates
A_UPDATE_STEPS = 10  # actor gradient steps per update
C_UPDATE_STEPS = 10  # critic gradient steps per update
EPSILON = 0.2 # Clipped surrogate objective: ratio is clipped to [1-EPSILON, 1+EPSILON]

class PPOActorNetwork(tf.keras.Model):
    """Policy network producing the mean and standard deviation of a Gaussian.

    A single 256-unit ReLU layer feeds two heads: a tanh head scaled by the
    upper action bound (the mean) and a softplus head (the standard deviation).
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.dense1 = dense(256, activation='relu')
        self.mu = dense(A_DIM, activation='tanh')
        self.sigma = dense(A_DIM, activation='softplus')

    def call(self, state):
        """Return ``(mean, std)`` for a batch of states."""
        hidden = self.dense1(state)
        # tanh output lies in [-1, 1]; scale it to the environment's upper bound.
        mean = self.mu(hidden) * A_BOUND[1]
        # Small offset keeps the standard deviation strictly positive.
        std = self.sigma(hidden) + 1e-4
        return mean, std
    
# NOTE(review): this definition shadows the CriticNetwork imported from
# Lib.ActorCritic at the top of the notebook; re-running the SAC cell after
# this one would pick up this class instead.
class CriticNetwork(tf.keras.Model):
    """State-value network: one 256-unit ReLU layer followed by a scalar head."""

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(256, activation='relu')
        self.value = tf.keras.layers.Dense(1)

    def call(self, state):
        """Return the value estimate for each state in the batch."""
        return self.value(self.dense1(state))
        
class PPO:
    """PPO-Clip agent holding a current actor, a frozen "old" actor and a critic."""

    def __init__(self):
        self.actor = PPOActorNetwork()
        self.actor_old = PPOActorNetwork()
        self.critic = CriticNetwork()

        # One forward pass per model creates its weights so they can be copied.
        warmup_state = tf.random.normal((1, S_DIM))
        for network in (self.actor, self.actor_old, self.critic):
            network(warmup_state)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=A_LR)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=C_LR)

        # Running means of the losses seen during one update() call.
        self.actor_loss_metric = tf.keras.metrics.Mean('actor_loss', dtype=tf.float32)
        self.critic_loss_metric = tf.keras.metrics.Mean('critic_loss', dtype=tf.float32)

    def update_old_actor(self):
        """Copy the current actor's weights into the old actor."""
        current_weights = self.actor.get_weights()
        self.actor_old.set_weights(current_weights)

    @tf.function
    def choose_action(self, state):
        """Sample one action for a single state, clipped to the action bounds."""
        batch = tf.convert_to_tensor([state], dtype=tf.float32)
        mean, std = self.actor(batch)
        noise = tf.random.normal(shape=mean.shape)
        sampled = mean + std * noise
        return tf.clip_by_value(sampled[0], A_BOUND[0], A_BOUND[1])

    @tf.function
    def get_value(self, state):
        """Return the critic's scalar value estimate for a single state."""
        batch = tf.convert_to_tensor([state], dtype=tf.float32)
        estimate = self.critic(batch)
        return estimate[0, 0]

    @tf.function
    def actor_loss(self, states, actions, advantages):
        """Return the negated clipped surrogate objective."""
        new_mean, new_std = self.actor(states)
        old_mean, old_std = self.actor_old(states)

        # Log-probabilities of the taken actions under both policies.
        new_log_prob = tfp.distributions.Normal(new_mean, new_std).log_prob(actions)
        old_log_prob = tfp.distributions.Normal(old_mean, old_std).log_prob(actions)

        ratio = tf.exp(new_log_prob - old_log_prob)
        unclipped = ratio * advantages
        clipped = tf.clip_by_value(ratio, 1.-EPSILON, 1.+EPSILON) * advantages

        # Taking the element-wise minimum gives the pessimistic bound.
        return -tf.reduce_mean(tf.minimum(unclipped, clipped))

    @tf.function
    def critic_loss(self, states, discounted_rewards):
        """Return the mean squared error between returns and value estimates."""
        residual = discounted_rewards - self.critic(states)
        return tf.reduce_mean(tf.square(residual))

    @tf.function
    def train_actor(self, states, actions, advantages):
        """Apply one gradient step to the actor and record its loss."""
        actor_vars = self.actor.trainable_variables
        with tf.GradientTape() as tape:
            loss = self.actor_loss(states, actions, advantages)
        gradients = tape.gradient(loss, actor_vars)
        self.actor_optimizer.apply_gradients(zip(gradients, actor_vars))
        self.actor_loss_metric.update_state(loss)
        return loss

    @tf.function
    def train_critic(self, states, discounted_rewards):
        """Apply one gradient step to the critic and record its loss."""
        critic_vars = self.critic.trainable_variables
        with tf.GradientTape() as tape:
            loss = self.critic_loss(states, discounted_rewards)
        gradients = tape.gradient(loss, critic_vars)
        self.critic_optimizer.apply_gradients(zip(gradients, critic_vars))
        self.critic_loss_metric.update_state(loss)
        return loss

    def update(self, states, actions, rewards):
        """Run one PPO update on a batch and return the mean losses.

        ``rewards`` is used both as the critic's regression target and, minus
        the critic's current estimate, as the advantage.

        Returns:
            Dict with keys ``'actor_loss'`` and ``'critic_loss'`` holding the
            losses averaged over this call's gradient steps.
        """
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)

        # Start the running means afresh for this update.
        self.actor_loss_metric.reset_state()
        self.critic_loss_metric.reset_state()

        # Advantage is computed once, before any parameter changes.
        advantages = rewards - self.critic(states)

        # Freeze the current policy as the reference for the probability ratio.
        self.update_old_actor()

        for _ in range(A_UPDATE_STEPS):
            self.train_actor(states, actions, advantages)

        for _ in range(C_UPDATE_STEPS):
            self.train_critic(states, rewards)

        mean_actor_loss = self.actor_loss_metric.result().numpy()
        mean_critic_loss = self.critic_loss_metric.result().numpy()
        return {
            'actor_loss': mean_actor_loss,
            'critic_loss': mean_critic_loss
        }

In [3]:
# Train the PPO agent, updating every BATCH steps and at the end of each episode.
ppo = PPO()
all_ep_r = []
all_actor_losses = []
all_critic_losses = []

for ep in range(EP_MAX):
    s = env.reset()[0]
    buffer_s, buffer_a, buffer_r = [], [], []
    ep_r = 0
    ep_actor_losses = []
    ep_critic_losses = []

    for t in range(EP_LEN):
        a = ppo.choose_action(s).numpy()
        s_, r, terminated, truncated, _ = env.step(a)

        buffer_s.append(s)
        buffer_a.append(a)
        buffer_r.append(r)

        s = s_
        ep_r += r
        episode_over = terminated or truncated

        # update ppo
        if (t + 1) % BATCH == 0 or t == EP_LEN - 1 or episode_over:
            # Bootstrap from the critic unless the episode truly terminated,
            # in which case there is no future return to add.
            running_return = 0.0 if terminated else ppo.get_value(s_).numpy()
            discounted_r = []
            for step_reward in reversed(buffer_r):
                running_return = step_reward + GAMMA * running_return
                discounted_r.append(running_return)
            discounted_r.reverse()

            bs = np.vstack(buffer_s)
            ba = np.vstack(buffer_a)
            br = np.array(discounted_r)[:, np.newaxis]

            losses = ppo.update(bs, ba, br)
            ep_actor_losses.append(losses['actor_loss'])
            ep_critic_losses.append(losses['critic_loss'])
            buffer_s, buffer_a, buffer_r = [], [], []

        # The environment must be reset before stepping again once it reports
        # the end of an episode.
        if episode_over:
            break

    all_ep_r.append(ep_r)
    avg_score = np.mean(all_ep_r[-100:])

    avg_actor_loss = np.mean(ep_actor_losses) if ep_actor_losses else 0
    avg_critic_loss = np.mean(ep_critic_losses) if ep_critic_losses else 0
    all_actor_losses.append(avg_actor_loss)
    all_critic_losses.append(avg_critic_loss)

    if (ep+1) % 10 == 0:
        print(f"Episode: {ep+1:4d} | "
              f"Score: {int(ep_r):5d} | "
              f"Avg Score: {int(avg_score):5d} | "
              f"Actor Loss: {avg_actor_loss:.2f} | "
              f"Critic Loss: {avg_critic_loss:.2f}")

    # Check if solved
    if len(all_ep_r) >= 100 and avg_score >= -300:
        print(f'Problem solved in {ep+1} episodes')
        break

Episode:   10 | Score: -1355 | Avg Score: -1337 | Actor Loss: 15.57 | Critic Loss: 430.60
Episode:   20 | Score: -1228 | Avg Score: -1288 | Actor Loss: 1.56 | Critic Loss: 92.47
Episode:   30 | Score: -1174 | Avg Score: -1285 | Actor Loss: -0.12 | Critic Loss: 107.26
Episode:   40 | Score: -1224 | Avg Score: -1270 | Actor Loss: 10.12 | Critic Loss: 231.58
Episode:   50 | Score: -1270 | Avg Score: -1258 | Actor Loss: 4.40 | Critic Loss: 151.51
Episode:   60 | Score: -1222 | Avg Score: -1248 | Actor Loss: 0.78 | Critic Loss: 27.52
Episode:   70 | Score: -1038 | Avg Score: -1235 | Actor Loss: -4.51 | Critic Loss: 63.09
Episode:   80 | Score: -1194 | Avg Score: -1233 | Actor Loss: -0.44 | Critic Loss: 37.30
Episode:   90 | Score: -1051 | Avg Score: -1222 | Actor Loss: 0.88 | Critic Loss: 45.49
Episode:  100 | Score: -1068 | Avg Score: -1219 | Actor Loss: -3.71 | Critic Loss: 48.60
Episode:  110 | Score: -1325 | Avg Score: -1201 | Actor Loss: 0.37 | Critic Loss: 48.76
Episode:  120 | Score:

KeyboardInterrupt: 

In [None]:
from IPython.display import Video, display
import cv2

def render_pendulum(agent, video_path="Renders/PPO_render.mp4",
                    num_timesteps=400, action_scale=1.0):
    """Roll out ``agent`` in the environment and save the frames as an MP4.

    Args:
        agent: Object exposing ``choose_action(state)``.
        video_path: Destination file; its parent directory is created if missing.
        num_timesteps: Maximum number of environment steps to record.
        action_scale: Factor applied to the agent's action before stepping
            (1.0 leaves the action unchanged).

    Raises:
        RuntimeError: If OpenCV cannot open the video writer.
    """
    from pathlib import Path

    Path(video_path).parent.mkdir(parents=True, exist_ok=True)
    # Raise the time limit to the requested length so the whole rollout is a
    # single episode instead of stepping past a truncated one.
    env = gym.make(env_name, render_mode='rgb_array',
                   max_episode_steps=num_timesteps)
    out = None
    try:
        state, _ = env.reset()
        frame_height, frame_width = env.render().shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_width, frame_height))
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for {video_path}")

        for _step in range(num_timesteps):
            action = np.asarray(agent.choose_action(state), dtype=np.float32) * action_scale
            state, _reward, terminated, truncated, _info = env.step(action)
            frame = env.render()
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
            if terminated or truncated:
                break
    finally:
        # Always release the writer and the environment, even on failure.
        if out is not None:
            out.release()
        env.close()
    display(Video(video_path))

render_pendulum(ppo)