# Reinforcement Learning Guide - Continuation Part 2

In this continuation, we will explore more advanced Reinforcement Learning algorithms and techniques.

## 1. Proximal Policy Optimization (PPO)

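Proximal Policy Optimization (PPO) is a policy gradient method that improves training stability by optimizing a clipped surrogate objective: the probability ratio between the new and the old policy is clipped to $[1 - \epsilon, 1 + \epsilon]$, so a single update cannot move the policy too far from the one that collected the data.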

In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole environment the agent interacts with
env = gym.make('CartPole-v1')

# Optimizer step size and problem dimensions read from the environment
learning_rate = 0.0003
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Policy network: two hidden ReLU layers, softmax output over the actions.
# compile() attaches the Adam optimizer that the training loop reaches
# through `policy_network.optimizer`.
policy_network = Sequential()
policy_network.add(Dense(128, input_dim=state_size, activation='relu'))
policy_network.add(Dense(128, activation='relu'))
policy_network.add(Dense(action_size, activation='softmax'))
policy_network.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=learning_rate),
)

# Value network: same hidden layout, single linear output trained with MSE
value_network = Sequential()
value_network.add(Dense(128, input_dim=state_size, activation='relu'))
value_network.add(Dense(128, activation='relu'))
value_network.add(Dense(1, activation='linear'))
value_network.compile(
    loss='mse',
    optimizer=Adam(learning_rate=learning_rate),
)

def choose_action(state):
    """Sample an action index from the policy network's output for `state`.

    The state is reshaped into a single-row batch of width `state_size`,
    passed through `policy_network`, and the first (only) row of the
    prediction is used as the sampling probabilities over
    `range(action_size)`.
    """
    action_probs = policy_network.predict(state.reshape([1, state_size]))[0]
    sampled_action = np.random.choice(action_size, p=action_probs)
    return sampled_action

def discount_rewards(rewards, gamma=0.99):
    """Return the discounted return for every step of one episode.

    Element ``t`` of the result is
    ``rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...``.

    Args:
        rewards: Sequence (list or array) of per-step rewards.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A float NumPy array with the same length as ``rewards``. Floating
        point input keeps its dtype; any other input is promoted to float64.
    """
    rewards = np.asarray(rewards)
    # Integer (or boolean) rewards would make the output array integer too,
    # silently truncating every discounted value, so promote them to float.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    discounted_rewards = np.zeros_like(rewards)
    cumulative_rewards = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(len(rewards))):
        cumulative_rewards = cumulative_rewards * gamma + rewards[t]
        discounted_rewards[t] = cumulative_rewards
    return discounted_rewards

# Training parameters
n_episodes = 500
gamma = 0.99
clip_ratio = 0.2

# Training loop: collect one complete episode with the current policy, then
# perform one policy update and one value-network fit on that episode.
for episode in range(n_episodes):
    reset_result = env.reset()
    # Newer Gym releases return (observation, info) from reset(); older ones
    # return only the observation.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    states, actions, rewards, old_probs, values = [], [], [], [], []
    done = False
    total_reward = 0

    while not done:
        state_batch = np.reshape(state, [1, state_size])
        # A single forward pass provides both the sampling distribution and
        # the "old policy" probabilities stored for the update.
        prob = policy_network.predict(state_batch, verbose=0)[0]
        value = value_network.predict(state_batch, verbose=0)[0, 0]
        action = np.random.choice(action_size, p=prob)

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer Gym API: separate terminated / truncated flags.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        old_probs.append(prob)
        values.append(value)

        state = next_state
        total_reward += reward

    # ----- Episode finished: build the training batch -----
    returns = np.asarray(
        discount_rewards(np.asarray(rewards, dtype=np.float64), gamma),
        dtype=np.float32,
    )
    # Values are stored as scalars, so this is shape (T,) and the subtraction
    # is element-wise instead of broadcasting (T,) against (T, 1) into (T, T).
    baseline = np.asarray(values, dtype=np.float32).reshape(-1)
    advantages = returns - baseline
    advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-10)

    states = np.vstack(states).astype(np.float32)
    actions = np.array(actions)
    old_probs = np.vstack(old_probs)
    step_index = np.arange(len(actions))
    # Probability the data-collecting policy gave to the action actually taken.
    old_action_probs = old_probs[step_index, actions].astype(np.float32)

    actions_one_hot = np.zeros([len(actions), action_size], dtype=np.float32)
    actions_one_hot[step_index, actions] = 1

    # NOTE: the weights do not change between data collection and this single
    # gradient step, so the ratio starts at ~1; clipping only becomes active
    # if several update steps are run on the same batch.
    with tf.GradientTape() as tape:
        prob = policy_network(states, training=True)
        prob = tf.reduce_sum(prob * actions_one_hot, axis=1)
        ratio = prob / (old_action_probs + 1e-10)
        # Keep the clipped tensor in its own name: rebinding `clip_ratio`
        # would destroy the hyperparameter for every following episode.
        clipped_ratio = tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio)
        surrogate1 = ratio * advantages
        surrogate2 = clipped_ratio * advantages
        policy_loss = -tf.reduce_mean(tf.minimum(surrogate1, surrogate2))

    grads = tape.gradient(policy_loss, policy_network.trainable_variables)
    policy_network.optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))

    # Regress the value network towards the observed discounted returns.
    value_network.fit(states, returns, verbose=0)

    print(f"Episode: {episode+1}, Total Reward: {total_reward}")

# Test the agent: play one episode greedily (always the most probable action)
reset_result = env.reset()
# Newer Gym releases return (observation, info) from reset(); older ones
# return only the observation.
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
done = False
total_reward = 0

try:
    while not done:
        state = np.reshape(state, [1, state_size])
        action = int(np.argmax(policy_network.predict(state, verbose=0)[0]))
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer Gym API: separate terminated / truncated flags.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        state = next_state
        total_reward += reward
        env.render()
finally:
    # Release the render window / environment resources even if a step fails.
    env.close()

print("Total reward:", total_reward)