<a href="https://colab.research.google.com/github/mohamedyosef101/101_learning_area/blob/area/Reinforcement%20Learning/03_deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Policy** Gradient

**Source:** [huggingface.co](https://huggingface.co/learn/deep-rl-course/unit4/hands-on)

# Set up

In [None]:
!pip install gymnasium pygame pyglet pickle5 pyyaml pyvirtualdisplay
!sudo apt-get update
!sudo apt-get install -y python-opengl
!apt install ffmpeg xvfb
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/simoninithomas/gym-games

In [2]:
# virtual display
from pyvirtualdisplay import Display

# Start a virtual, non-visible display of 1400x900 pixels.
display_options = {"visible": 0, "size": (1400, 900)}
virtual_display = Display(**display_options)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7ca1a0315180>

# Import the packages

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from collections import deque

import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.distributions import Categorical as cat

import gym
import gym_pygame

from huggingface_hub import notebook_login
import imageio

print("Packages are ready!")

Packages are ready!


## Check GPU

In [4]:
# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


# CartPole-v1

In [5]:
env_id = "CartPole-v1"

# Two separate instances of the same task: `env` is stepped during training,
# `eval_env` is handed to the evaluation routine.
env, eval_env = (gym.make(env_id) for _ in range(2))

# Length of the observation vector and number of discrete actions.
observation_shape = env.observation_space.shape
states = observation_shape[0]
actions = env.action_space.n

print(f"""The State Space is: {states}
      \nThe Action Space is: {actions}""")

The State Space is: 4
      
The Action Space is: 2


  deprecation(
  deprecation(


# Policy

In [22]:
class Policy(nn.Module):
  """Two-layer softmax policy network for a discrete action space.

  A state vector goes through one hidden ReLU layer and a linear output
  layer; a softmax turns the outputs into action probabilities.

  Args:
    states: Size of the input state vector.
    actions: Number of discrete actions (size of the output layer).
    horizon: Width of the hidden layer.
  """

  def __init__(self, states, actions, horizon):
    super().__init__()

    # two fully connected layers
    self.fc1 = nn.Linear(states, horizon)
    self.fc2 = nn.Linear(horizon, actions)

  def forward(self, x):
    """Return action probabilities for a batch of states.

    Args:
      x: Float tensor of shape (batch, states).

    Returns:
      Tensor of shape (batch, actions); every row sums to 1.
    """
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    # dim=1 normalises across actions, so the input needs a leading batch axis.
    return F.softmax(x, dim=1)

  def act(self, state):
    """Sample one action for a single observation.

    Args:
      state: NumPy array holding one observation (no batch axis).

    Returns:
      Tuple `(action, log_prob)`: the sampled action as a Python int and its
      log-probability as a CPU tensor of shape (1,) that is still attached
      to the autograd graph.
    """
    # Follow the device the weights actually live on instead of relying on a
    # notebook-level `device` variable; this keeps the class self-contained.
    param_device = self.fc1.weight.device
    state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
    probs = self(state).cpu()
    m = cat(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

In [23]:
# Smoke test: feed one freshly reset observation through an untrained policy
# and display the sampled action together with its log-probability.
debug_policy = Policy(states, actions, horizon=64).to(device)
debug_state = env.reset()
debug_policy.act(debug_state)

(0, tensor([-0.6695], grad_fn=<SqueezeBackward1>))

# Reinforce Algorithm

In [28]:
def reinforce(policy, optimizer,
              n_training_episodes, max_t, gamma, print_every,
              train_env=None):
  """Train `policy` with the REINFORCE (Monte-Carlo policy gradient) algorithm.

  Each episode is rolled out with `policy.act`, the discounted return of
  every step is computed, and one optimizer step is taken on the loss
  `sum_t(-log_prob_t * return_t)`.

  Args:
    policy: Object exposing `act(state) -> (action, log_prob)`, where
      `log_prob` is a tensor of at least one dimension attached to the
      autograd graph of the parameters held by `optimizer`.
    optimizer: Optimizer used for the update after every episode.
    n_training_episodes: Number of episodes to run.
    max_t: Maximum number of steps per episode.
    gamma: Discount factor applied to future rewards.
    print_every: Print the running average score every this many episodes.
    train_env: Environment to train on. When omitted, the notebook-level
      `env` is used, which keeps existing six-argument calls working.

  Returns:
    List with the undiscounted total reward of every training episode.
  """
  # Only touch the notebook-level `env` when no environment was passed in.
  active_env = env if train_env is None else train_env

  # Running window for the printed average (last 100 episodes) and the full
  # per-episode history that is returned to the caller.
  scores_deque = deque(maxlen=100)
  scores = []
  eps = np.finfo(np.float32).eps.item()

  for episode in range(1, n_training_episodes + 1):
    saved_log_probs = []
    rewards = []

    # reset() may return the bare observation or an (observation, info) tuple.
    reset_out = active_env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

    for _ in range(max_t):
      action, log_prob = policy.act(state)
      saved_log_probs.append(log_prob)

      # step() may return 4 values (done) or 5 (terminated, truncated).
      step_out = active_env.step(action)
      if len(step_out) == 5:
        state, reward, terminated, truncated, _ = step_out
        done = terminated or truncated
      else:
        state, reward, done, _ = step_out

      rewards.append(reward)
      if done:
        break

    # Store the episode total (a number), not the raw list of rewards.
    episode_score = sum(rewards)
    scores_deque.append(episode_score)
    scores.append(episode_score)

    # An episode without any step (max_t == 0) has nothing to learn from.
    if saved_log_probs:
      # Discounted returns, accumulated from the last step backwards.
      returns = []
      running_return = 0.0
      for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
      returns.reverse()

      returns = torch.tensor(returns, dtype=torch.float32)
      # Standardise the returns to reduce gradient variance. The sample
      # standard deviation of a single value is NaN, so a one-step episode
      # keeps its raw return instead of poisoning the weights with NaN.
      if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

      policy_loss = torch.cat([
          -log_prob * disc_return
          for log_prob, disc_return in zip(saved_log_probs, returns)
      ]).sum()

      optimizer.zero_grad()
      policy_loss.backward()
      optimizer.step()

    if episode % print_every == 0:
      print(f"Episode {episode} \tAverage Score: {np.mean(scores_deque):.2f}")

  return scores

# Train the REINFORCE Agent

In [25]:
# Every tunable setting of the CartPole run, collected in one place.
cartpole_hyperparameters = dict(
    horizon=16,                 # hidden-layer width handed to Policy
    n_training_episodes=1000,   # episodes passed to reinforce
    n_evaluation_episodes=10,   # episodes passed to evaluate_agent
    max_t=1000,                 # step cap per episode
    gamma=1.0,                  # discount factor
    lr=1e-2,                    # Adam learning rate
    env_id=env_id,
    state_space=states,
    action_space=actions,
)

In [26]:
# Build the policy network from the hyperparameters and move it to `device`.
cartpole_policy = Policy(
    states=cartpole_hyperparameters["state_space"],
    actions=cartpole_hyperparameters["action_space"],
    horizon=cartpole_hyperparameters["horizon"],
).to(device)

# Adam over all policy weights, using the configured learning rate.
cartpole_optimizer = optim.Adam(
    params=cartpole_policy.parameters(),
    lr=cartpole_hyperparameters["lr"],
)

In [29]:
# Run training; the running average score is printed every 100 episodes.
scores = reinforce(
    policy=cartpole_policy,
    optimizer=cartpole_optimizer,
    n_training_episodes=cartpole_hyperparameters["n_training_episodes"],
    max_t=cartpole_hyperparameters["max_t"],
    gamma=cartpole_hyperparameters["gamma"],
    print_every=100,
)

Episode 100 	Average Score: 314.48
Episode 200 	Average Score: 461.93
Episode 300 	Average Score: 255.23
Episode 400 	Average Score: 378.78
Episode 500 	Average Score: 500.00
Episode 600 	Average Score: 500.00
Episode 700 	Average Score: 500.00
Episode 800 	Average Score: 500.00
Episode 900 	Average Score: 500.00
Episode 1000 	Average Score: 422.94


# Evaluation Method

In [30]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
  """Run `policy` for several episodes and summarise the episode returns.

  Actions come from `policy.act(state)`; only the first element of its
  result (the action) is used.

  Args:
    env: Environment exposing `reset()` and `step(action)`. Both the
      4-value (`done`) and 5-value (`terminated`, `truncated`) step
      results are accepted, as is an `(observation, info)` reset result.
    max_steps: Maximum number of steps per evaluation episode.
    n_eval_episodes: Number of episodes to run.
    policy: Object exposing `act(state) -> (action, log_prob)`.

  Returns:
    Tuple `(mean_reward, std_reward)` over the undiscounted episode totals.
  """
  episode_rewards = []
  for _ in range(n_eval_episodes):
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    total_rewards_ep = 0

    # Evaluation never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
      for _step in range(max_steps):
        action, _log_prob = policy.act(state)

        step_out = env.step(action)
        if len(step_out) == 5:
          new_state, reward, terminated, truncated, _info = step_out
          done = terminated or truncated
        else:
          new_state, reward, done, _info = step_out

        total_rewards_ep += reward

        if done:
          break

        state = new_state
    episode_rewards.append(total_rewards_ep)

  mean_reward = np.mean(episode_rewards)
  std_reward = np.std(episode_rewards)

  return mean_reward, std_reward

In [31]:
# Score the trained policy on the separate evaluation environment; the cell
# displays the resulting (mean_reward, std_reward) pair.
evaluate_agent(
    env=eval_env,
    max_steps=cartpole_hyperparameters["max_t"],
    n_eval_episodes=cartpole_hyperparameters["n_evaluation_episodes"],
    policy=cartpole_policy,
)

  if not isinstance(terminated, (bool, np.bool8)):


(500.0, 0.0)