# Reinforcement Learning Guide - Continuation

In this continuation, we will explore more advanced Reinforcement Learning algorithms and techniques.

## 1. Policy Gradient Methods

Policy Gradient methods are a family of Reinforcement Learning algorithms that parameterize the policy directly and optimize its parameters by gradient ascent on the expected return.

### REINFORCE Algorithm

In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole environment the agent is trained and evaluated on
env = gym.make('CartPole-v1')

# Hyperparameter and problem dimensions (the latter read from the environment)
learning_rate = 0.001
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Policy network: two 24-unit ReLU hidden layers followed by a softmax head
# with one output per action, so each prediction is a probability vector
policy_network = Sequential()
policy_network.add(Dense(24, input_dim=state_size, activation='relu'))
policy_network.add(Dense(24, activation='relu'))
policy_network.add(Dense(action_size, activation='softmax'))
policy_network.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=learning_rate),
)

def choose_action(state):
    """Sample an action index from the policy network's output distribution.

    `state` is reshaped to a single-row batch of width `state_size`, passed to
    `policy_network`, and the first (only) row of the prediction is used as
    the probability vector for a random draw over `action_size` actions.
    """
    batch = state.reshape([1, state_size])
    action_probs = policy_network.predict(batch)[0]
    return np.random.choice(action_size, p=action_probs)

def discount_rewards(rewards, gamma=0.99):
    """Return the discounted return-to-go for every step of an episode.

    For step ``t`` the result is ``rewards[t] + gamma * result[t + 1]``, with
    the value after the last step taken as 0.

    Args:
        rewards: Sequence of per-step rewards, in chronological order.
        gamma: Discount factor applied per step.

    Returns:
        A 1-D float64 NumPy array with one entry per reward. An empty input
        yields an empty array.
    """
    # Allocate an explicit float buffer: np.zeros_like would inherit an
    # integer dtype from integer rewards and silently truncate the returns.
    discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
    cumulative_rewards = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(len(rewards))):
        cumulative_rewards = cumulative_rewards * gamma + rewards[t]
        discounted_rewards[t] = cumulative_rewards
    return discounted_rewards

# Training parameters
n_episodes = 1000
gamma = 0.99

# Training loop: collect one full episode with the current stochastic policy,
# then fit the policy network once on that episode's transitions.
for episode in range(n_episodes):
    state = env.reset()
    states, actions, rewards = [], [], []
    done = False
    total_reward = 0

    # Roll out the episode, recording (state, action, reward) per step
    while not done:
        action = choose_action(state)
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state
        total_reward += reward

    # Standardise the discounted returns to use as per-step weights.
    # Non-in-place float arithmetic keeps this valid whatever dtype
    # discount_rewards returns; the epsilon prevents a 0/0 -> NaN weight
    # (which would corrupt the network) when all returns are equal.
    discounted_rewards = np.asarray(discount_rewards(rewards, gamma), dtype=np.float64)
    discounted_rewards = discounted_rewards - np.mean(discounted_rewards)
    discounted_rewards = discounted_rewards / (np.std(discounted_rewards) + 1e-8)

    states = np.vstack(states)
    actions = np.array(actions)
    advantages = discounted_rewards

    # One-hot encode the actions actually taken
    actions_one_hot = np.zeros([len(actions), action_size])
    actions_one_hot[np.arange(len(actions)), actions] = 1

    # Cross-entropy against the taken action is -log pi(a|s); weighting each
    # sample by its standardised return gives the REINFORCE surrogate loss.
    policy_network.fit(states, actions_one_hot, sample_weight=advantages, verbose=0)

    print(f"Episode: {episode+1}, Total Reward: {total_reward}")

# Test the agent: play one episode, always taking the highest-probability
# action, and report the accumulated reward
state = env.reset()
done = False
total_reward = 0

try:
    while not done:
        state = state.reshape([1, state_size])
        action = np.argmax(policy_network.predict(state)[0])
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
        env.render()
finally:
    # Close the environment so anything opened by render() is released,
    # even if the episode is interrupted
    env.close()

print("Total reward:", total_reward)

## 2. Actor-Critic Methods

Actor-Critic methods combine policy-based and value-based methods. The actor is the policy being learned, and the critic estimates the value function that is used to evaluate the actor's actions.

### A2C (Advantage Actor-Critic) Algorithm

In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole environment the agent is trained and evaluated on
env = gym.make('CartPole-v1')

# Hyperparameter and problem dimensions (the latter read from the environment)
learning_rate = 0.001
state_size = env.observation_space.shape[0]
action_size = env.action_space.n


def _build_mlp(output_units, output_activation, loss):
    """Return a compiled net: two 24-unit ReLU layers plus the given output head."""
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(output_units, activation=output_activation))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss)
    return model


# Actor: softmax head with one probability per action
actor_network = _build_mlp(action_size, 'softmax', 'categorical_crossentropy')

# Critic: single linear output used as the state-value estimate
critic_network = _build_mlp(1, 'linear', 'mse')

def choose_action(state):
    """Sample an action index from the actor network's output distribution.

    `state` is reshaped to a single-row batch of width `state_size` and fed
    to `actor_network`; the first (only) row of the prediction supplies the
    probabilities for a random draw over `action_size` actions.
    """
    action_probs = actor_network.predict(state.reshape([1, state_size]))[0]
    return np.random.choice(action_size, p=action_probs)

# Training parameters
n_episodes = 1000
gamma = 0.99

# Training loop: collect one full episode, compute advantages from the
# critic's value estimates, then fit the actor and the critic once each.
for episode in range(n_episodes):
    state = env.reset()
    states, actions, rewards, values = [], [], [], []
    done = False
    total_reward = 0

    # Roll out the episode, recording the critic's value for each visited state
    while not done:
        action = choose_action(state)
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        # Reshape to a (1, state_size) batch before predicting, as
        # choose_action and the final-state call below already do
        values.append(critic_network.predict(state.reshape([1, state_size]))[0][0])

        state = next_state
        total_reward += reward

    # Value of the state after the last step, so values[t + 1] exists for all t.
    # NOTE(review): this bootstraps from the critic even when the episode
    # ended in a true terminal state, where the value is conventionally 0 -
    # confirm this is intended.
    values.append(critic_network.predict(next_state.reshape([1, state_size]))[0][0])

    # Backward pass: delta is the one-step TD error; gae accumulates the
    # gamma-discounted sum of TD errors (no separate lambda factor is applied)
    advantages, returns = [], []
    gae = 0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t+1] - values[t]
        gae = delta + gamma * gae
        advantages.append(gae)
        returns.append(gae + values[t])
    # Lists were filled from the last step to the first; restore time order
    advantages.reverse()
    returns.reverse()

    states = np.vstack(states)
    actions = np.array(actions)
    advantages = np.array(advantages)
    returns = np.array(returns)

    # One-hot encode the actions actually taken
    actions_one_hot = np.zeros([len(actions), action_size])
    actions_one_hot[np.arange(len(actions)), actions] = 1

    # Actor: cross-entropy on the taken action, weighted by its advantage.
    # Critic: regress the value estimate toward the computed returns.
    actor_network.fit(states, actions_one_hot, sample_weight=advantages, verbose=0)
    critic_network.fit(states, returns, verbose=0)

    print(f"Episode: {episode+1}, Total Reward: {total_reward}")

# Test the agent: play one episode, always taking the actor's
# highest-probability action, and report the accumulated reward
state = env.reset()
done = False
total_reward = 0

try:
    while not done:
        state = state.reshape([1, state_size])
        action = np.argmax(actor_network.predict(state)[0])
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
        env.render()
finally:
    # Close the environment so anything opened by render() is released,
    # even if the episode is interrupted
    env.close()

print("Total reward:", total_reward)