<a href="https://colab.research.google.com/github/mohamedyosef101/101_learning_area/blob/area/Reinforcement%20Learning/03_deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Source:** [Hugging Face Deep RL Course, Unit 4 hands-on](https://huggingface.co/learn/deep-rl-course/unit4/hands-on)

# Set up

In [None]:
!pip install gymnasium pygame pyglet pickle5 pyyaml pyvirtualdisplay
!sudo apt-get update
!sudo apt-get install -y python-opengl
!apt install ffmpeg xvfb
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/simoninithomas/gym-games

In [None]:
# Create and start a virtual (off-screen) display for this session.
from pyvirtualdisplay import Display

virtual_display = Display(
  size=(1400, 900),  # width x height of the virtual screen
  visible=0,         # do not show the display on a real screen
)
virtual_display.start()

# Import the packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from collections import deque

import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.distributions import Categorical as cat

import gym
import gym_pygame

from huggingface_hub import notebook_login
import imageio

## Check GPU

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

# CartPole-v1

In [None]:
env_id = "CartPole-v1"

# Two separate instances of the same environment
# (eval_env is presumably reserved for evaluation -- confirm against later cells).
env, eval_env = gym.make(env_id), gym.make(env_id)

# First dimension of the observation shape, and the number of discrete actions
states, actions = env.observation_space.shape[0], env.action_space.n

print(f"""The State Space is: {states}
      \nThe Action Space is: {actions}""")

# Policy

In [None]:
class Policy(nn.Module):
  """Two-layer fully connected network that outputs action probabilities.

  Args:
    states: Number of input features fed to the first linear layer.
    actions: Number of outputs of the second linear layer (one per action).
    horizon: Width of the hidden layer between the two linear layers.
  """

  def __init__(self, states, actions, horizon):
    super(Policy, self).__init__()

    # two fully connected layers
    self.fc1 = nn.Linear(states, horizon)
    self.fc2 = nn.Linear(horizon, actions)

  def forward(self, x):
    """Return action probabilities for a batch of states.

    Args:
      x: Tensor of shape (batch, states).

    Returns:
      Tensor of shape (batch, actions); each row is a softmax distribution.
    """
    x = F.relu(self.fc1(x))
    # The layer must be *called* on x; assigning the module itself would hand
    # an nn.Linear object to softmax.
    x = self.fc2(x)
    return F.softmax(x, dim=1)

  def act(self, state):
    """Sample an action for a single state.

    Args:
      state: NumPy array holding one observation (no batch dimension).

    Returns:
      Tuple ``(action, log_prob)``: the sampled action as a Python int, and
      the log-probability of that action as a tensor of shape (1,) that is
      still attached to the autograd graph.
    """
    # Send the input to wherever the parameters live so the call works
    # regardless of which device the module was moved to.
    param_device = next(self.parameters()).device
    # unsqueeze(0) adds the batch dimension that forward's softmax(dim=1) expects.
    state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
    probs = self(state).cpu()
    m = cat(probs)
    action = m.sample()
    # Log-probability of the action that was sampled (not the action count).
    return action.item(), m.log_prob(action)

In [None]:
# Smoke test: build a throwaway policy and sample one action from a fresh state
debug_policy = Policy(states, actions, horizon=64)
debug_policy.to(device)  # nn.Module.to moves the parameters in place and returns the module
debug_policy.act(env.reset())

# Reinforce Algorithm

In [None]:
def reinforce(policy, optimizer,
              n_training_episodes, max_t, gamma, print_every):
  # calculate the score
  scores_deque = deque(maxlen=100)
  scores = []

  for episode in range(1, n_training_episodes + 1):
    saved_log_probs = []
    rewards = []
    state = env.reset()

    for t in range(max_t):
      action, log_prob = policy.act(state)
      saved_log_probs.append(log_prob)
      state, reward, done, _ = env.step(action)
      rewards.append(reward)
      if done:
        break
    scores_deque.append(sum(rewards))
    scores.append(rewards)

    returns = deque(maxlen=max_t)
    n_steps = len(rewards)

    for t in range(n_steps)
