<a href="https://colab.research.google.com/github/murphybrendan/ml-courses/blob/main/huggingface/deep-rl/unit3/dqn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup

##Install necessary packages

In [15]:
!pip install pytest

  and should_run_async(code)




In [2]:
!pip install wandb einops pygame stable_baselines3
!pip install gymnasium[classic_control,box2d,atari]
!pip install gymnasium[accept-rom-license]

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.6 MB/s[0m e

## Set up the virtual display

In [3]:
%%capture
!apt install python-opengl
!apt install xvfb
!pip3 install pyvirtualdisplay

In [4]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7e8536a12e30>

## Imports and other things

In [5]:
import random
import torch
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
import stable_baselines3

from gymnasium import Env
from gymnasium.spaces import Space, Discrete, Box
from gymnasium.vector import VectorEnv
from stable_baselines3.common.buffers import ReplayBuffer
from tqdm.notebook import trange, tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Probe Environments

Since DQN is model-free, we can use test environments unrelated to our target environment to get a sense of if our algorithm is working. We want to use extremely simple environments that DQN should be able to learn very easily.

So what's the easiest thing our DQN agent can do?

*Nothing*

That's it. Just sit there and do nothing. We'll define an environment with a discrete action space with only one choice. Then there's only one timestep with will reward with +1. You can think of this as the agent learning V instead of Q.

In [91]:
from gymnasium import Env

class Probe1(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Discrete(1)
        self.action_space = Discrete(1)

    def step(self, action: int) -> tuple[int, float, bool, bool, dict]:
        return (0, 1.0, True, False, {})

    def reset(self, seed: int = None, options = None) -> tuple[int, dict]:
        super().reset(seed=seed)
        return self.observation_space.sample(), {}


gym.envs.registration.register(id="Probe1-v0", entry_point=Probe1)
env = gym.make("Probe1-v0")
assert env.observation_space.n == 1
assert env.action_space.n == 1

  and should_run_async(code)
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


Test that the environment works correctly using an existing agent.

In [11]:
def train_model(env):
    model = stable_baselines3.DQN("MlpPolicy", env, learning_rate=0.001)
    model.learn(total_timesteps=1000)
    return model

  and should_run_async(code)


See that the model learned that the value of taking action 0 (the only action) at state \[0\] is 1.

In [14]:
model = train_model(env)
model.q_net(torch.zeros((1,1), device=device))

tensor([[1.0000]], device='cuda:0', grad_fn=<AddmmBackward0>)

Make more probes

1.   An environment with two actions \[0, 1\]. Taking action 1 gives +1 reward, taking action 0 gives 0 reward. Tests that we're exploring actions.
1.   An environment with 2 actions. Taking action 1 advances to the next state and gives +1 reward. Taking action 0 does not advance the state and gives -1 reward. Tests the calculation using `dones`.
1.   An environment with 2 states and 2 actions. Taking the action of the state gives a reward. Tests state.





## Probe 2

In [16]:
class Probe2(gym.Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Discrete(1)
        self.action_space = Discrete(2)

    def step(self, action: int) -> tuple[int, float, bool, bool, dict]:
        return (0, float(action), True, False, {})

    def reset(self, seed: int = None, options = None) -> tuple[int, dict]:
        super().reset(seed=seed)
        return self.observation_space.sample(), {}


gym.envs.registration.register(id="Probe2-v0", entry_point=Probe2)
env = gym.make("Probe2-v0")
assert env.observation_space.n == 1
assert env.action_space.n == 2

  and should_run_async(code)


In [17]:
model = train_model(env)
model.q_net(torch.zeros((1,1), device=device))

tensor([[1.0803e-06, 1.0000e+00]], device='cuda:0', grad_fn=<AddmmBackward0>)

## Probe 3

In [158]:
class Probe3(gym.Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Discrete(1)
        self.action_space = Discrete(2)

    def step(self, action: int) -> tuple[int, float, bool, bool, dict]:
        return (0, 1.0 if action else -1.0, bool(action), False, {})

    def reset(self, seed: int = None, options = None) -> tuple[int, dict]:
        super().reset(seed=seed)
        return self.observation_space.sample(), {}


gym.envs.registration.register(id="Probe3-v0", entry_point=Probe3)
env = gym.make("Probe3-v0")
assert env.observation_space.n == 1
assert env.action_space.n == 2

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [19]:
model = train_model(env)

  logger.deprecation(


In [20]:
model.q_net(torch.zeros((1,1), device=device))

tensor([[-0.0894,  1.0000]], device='cuda:0', grad_fn=<AddmmBackward0>)

## Probe 4



In [21]:
class Probe4(gym.Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Discrete(2)
        self.action_space = Discrete(2)

    def step(self, action: int) -> tuple[int, float, bool, bool, dict]:
        return (self.state, float(action == self.state), True, False, {})

    def reset(self, seed: int = None, options = None) -> tuple[int, dict]:
        super().reset(seed=seed)
        self.state = self.observation_space.sample()
        return self.state, {}


gym.envs.registration.register(id="Probe4-v0", entry_point=Probe4)
env = gym.make("Probe4-v0")
assert env.observation_space.n == 2
assert env.action_space.n == 2

In [22]:
model = train_model(env)
model.q_net(torch.zeros((1,1), device=device))

tensor([[ 9.9999e-01, -1.9893e-05]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [23]:
model.q_net(torch.ones((1,1), device=device))

tensor([[8.0861e-05, 1.0000e+00]], device='cuda:0', grad_fn=<AddmmBackward0>)

## Test functions

In [57]:
probes = {
    "Probe1-v0": [(torch.zeros((1,1), device=device), torch.ones((1,1), device=device))],
    "Probe2-v0": [(torch.zeros((1,1), device=device), torch.tensor([[0.0, 1.0]]).to(device=device))],
    "Probe3-v0": [(torch.zeros((1,1), device=device), torch.tensor([[0.0, 1.0]]).to(device=device))],
    "Probe4-v0": [(torch.zeros((1,1), device=device), torch.tensor([[1.0, 0.0]]).to(device=device)),
                  (torch.ones((1,1), device=device), torch.tensor([[0.0, 1.0]]).to(device=device))]
}


def test_model_on_probe(env_id, model_class, **kwargs):
    env = gym.make(env_id)
    model = model_class(env=env, **kwargs)
    model.learn(total_timesteps=5000)
    for obs, value in probes[env_id]:
        torch.testing.assert_close(model.q_net(obs), value, atol=0.1, rtol=0.0)

def test_model_on_probes(model_class, **kwargs):
    for env_id in probes:
        test_model_on_probe(env_id, model_class, **kwargs)



In [58]:
test_model_on_probes(model_class=stable_baselines3.DQN, policy="MlpPolicy", learning_rate=0.001)

#DQN Implementation

Implement the Q-Network. It's a simple feed forward network with some number of hidden layers. The input dimension is the dimension of an observation, and the output dimension is the dimension of the action space.

In [59]:
class QNetwork(torch.nn.Module):
    def __init__(self, observation_dim, action_space_dim, hidden_layers=[128, 64]):
        super().__init__()
        layer_dim = [observation_dim] + hidden_layers
        layers = []
        for i in range(len(layer_dim)-1):
            layers.append(torch.nn.Linear(layer_dim[i], layer_dim[i+1]))
            layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Linear(layer_dim[-1], action_space_dim))
        self.layers = torch.nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)

In [60]:
q = QNetwork(10, 2)
q

QNetwork(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

Define the epsilon-greedy policy

In [160]:
def epsilon_greedy_policy(q_net: QNetwork, observation: np.ndarray, action_space: Space, eps: float):
    if random.random() < eps:
        return np.array([action_space.sample() for _ in range(observation.shape[0])])
    obs_tensor = torch.from_numpy(observation).to(device).float()
    action = torch.argmax(q_net(obs_tensor), dim=-1, keepdim=True).detach().cpu().numpy()
    return action

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env.base_vec_env import VecEnv

class DQN:
    def __init__(self,
                 env: Env | VecEnv, buffer_size=1000000, batch_size=32, gamma=0.99, train_freq=4, exploration_initial_eps=1.0, exploration_final_eps=0.05, learning_starts=100, target_update_interval=10000, learning_rate=0.0001) -> None:
        self.env = env if isinstance(env, VecEnv) else make_vec_env(lambda: env)
        self.batch_size = batch_size
        self.gamma = gamma
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_final_eps = exploration_final_eps

        self.replay_buffer = ReplayBuffer(buffer_size, env.observation_space, env.action_space)

        obs_dim = np.array(env.observation_space.shape, dtype=int).prod()
        self.q_net = QNetwork(obs_dim, env.action_space.n).to(device)
        self.q_net_target = QNetwork(obs_dim, env.action_space.n).to(device)
        self.q_net_target.load_state_dict(self.q_net.state_dict())
        self.optimizer = torch.optim.SGD(self.q_net.parameters(), learning_rate)

        self.last_target_update = learning_starts
        self.last_training_step = learning_starts
        self.timestep = 0


    def collect_rollouts(self):
        action = epsilon_greedy_policy(self.q_net, self.prev_observation, self.env.action_space, self.eps)
        next_observation, reward, done, info = self.env.step(action)

        self.replay_buffer.add(self.prev_observation, next_observation, action, reward, done, info)
        self.prev_observation = next_observation
        self.timestep += self.env.num_envs

    def step(self):
        samples = self.replay_buffer.sample(self.batch_size)

        q_values = self.q_net(samples.observations.float())  # (batch_dim, action_space_dim)
        current_q_values = q_values.gather(1, samples.actions).squeeze(-1)  # (batch_dim)

        with torch.no_grad():
            next_q_values = self.q_net_target(samples.next_observations.float()).max(dim=1)[0]
            target_q_values = samples.rewards.squeeze(-1) + self.gamma * next_q_values * (1.0 - samples.dones.squeeze(-1))

        loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()



    def learn(self, total_timesteps):
        self.prev_observation = self.env.reset()
        self.prev_observation = np.array([self.prev_observation])
        for _ in trange(total_timesteps // self.env.num_envs):

            # Linear decay of epsilon over the course of training
            self.eps = self.exploration_initial_eps * (1 - self.timestep / total_timesteps) + self.exploration_final_eps * (self.timestep / total_timesteps)

            self.collect_rollouts()
            if self.last_training_step < self.timestep - self.train_freq:
                self.step()
                self.last_training_step = self.timestep
                if self.last_target_update < self.timestep - self.target_update_interval:
                    self.q_net_target.load_state_dict(self.q_net.state_dict())
                    self.last_target_update = self.timestep

test_model_on_probe("Probe4-v0", DQN, learning_rate=0.01, target_update_interval=100)


  0%|          | 0/5000 [00:00<?, ?it/s]

In [97]:
test_model_on_probes(DQN, learning_rate=0.001, target_update_interval=100)

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  loss = F.mse_loss(x, y)


AssertionError: Tensor-likes are not close!

Mismatched elements: 2 / 2 (100.0%)
Greatest absolute difference: 0.6621853709220886 at index (0, 1) (up to 0.1 allowed)
Greatest relative difference: inf at index (0, 0) (up to 0.0 allowed)

In [None]:
[
    ("Probe1-v0", torch.)
]

def test_probe1(model_class, **kwargs):
    env = env = gym.make("Probe1-v0")
    model