In [2]:
## all packages
# Install Gymnasium with every optional extra; the captured log below shows this
# pulling in box2d-py, swig, mujoco-py, mujoco, lz4 and glfw.
! pip install gymnasium[all]
# NOTE(review): no `-y` flag is passed, so apt-get may stop and wait for
# confirmation -- verify this runs unattended in the target environment.
!apt-get install git
# Fetch the course environments and move the kernel's working directory into the
# checkout. Relative paths used after this point resolve from inside gym_RLcourse/.
# NOTE(review): re-running this cell repeats the clone and the %cd -- presumably
# meant to be executed once per fresh kernel.
!git clone https://github.com/magni84/gym_RLcourse.git
%cd gym_RLcourse
# Editable install of the cloned package so that `import gym_RLcourse` succeeds.
!pip install -e .
!pip install matplotlib
!pip install pygame

# NOTE(review): gymnasium is installed a second time here, now with the Atari
# extras; none of the installs pin a version.
! pip install gymnasium 'gymnasium[atari]' 'gymnasium[accept-rom-license]'

# Standard library
import argparse
import copy
import math
import os
import random
from collections import namedtuple, deque
from itertools import count

# Third-party
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output # Used to clear the output of a Jupyter cell.

### Torch imports
import torch
# from torch import nn
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Course environments (installed above with `pip install -e .`)
import gym_RLcourse

Collecting box2d-py==2.3.5 (from gymnasium[all])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[all])
  Using cached swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Collecting mujoco-py<2.2,>=2.1 (from gymnasium[all])
  Using cached mujoco_py-2.1.2.14-py3-none-any.whl (2.4 MB)
Collecting cython<3 (from gymnasium[all])
  Using cached Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Collecting mujoco>=2.3.3 (from gymnasium[all])
  Using cached mujoco-3.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
Collecting lz4>=3.1.0 (from gymnasium[all])
  Using cached lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting glfw (from mujoco>=2.3.3->gymnasium[all])
  Using cached glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.

In [None]:
## config.py
"""
Hyperparameters used for the different environments; edit them here.

Keys of each configuration:

memory_size: Maximum number of transitions kept in the replay memory.
n_episodes: Number of training episodes.
batch_size: Number of transitions per DQN training batch.
target_update_frequency: How often the target network is updated.
train_frequency: How often the DQN is trained.
gamma: Discount factor.
lr: Learning rate handed to the optimizer.
eps_start: Initial epsilon (linear annealing).
eps_end: Final epsilon (linear annealing).
anneal_length: Number of steps over which epsilon is annealed.
n_actions: Number of actions. env.action_space.n would give this directly, but it
    is set by hand to account for the fact that Pong has duplicate actions.
epsilon: Exploration probability read by DQN.act().
"""

# Hyperparameters for CartPole-v1
CartPole = dict(
    memory_size=50_000,
    n_episodes=1_000,
    batch_size=32,
    target_update_frequency=100,
    train_frequency=1,
    gamma=0.95,
    lr=1e-4,
    eps_start=1.0,
    eps_end=0.05,
    anneal_length=10_000,
    n_actions=2,
    epsilon=0.9,
)

In [None]:
## utils.py
# import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def preprocess(obs, env):
    """Turn a raw environment observation into a float tensor on `device`.

    Args:
        obs: Observation as returned by the environment.
        env: Environment id; only 'CartPole-v1' is handled.

    Returns:
        The observation as a float tensor placed on `device`.

    Raises:
        ValueError: If no preprocessing is defined for `env`.
    """
    # Reject unknown environments first so the supported path reads straight through.
    if env not in ('CartPole-v1',):
        raise ValueError('Please add necessary observation preprocessing instructions to preprocess() in utils.py.')

    as_tensor = torch.tensor(obs, device=device)
    return as_tensor.float()


In [None]:
## dqn.py

# Select the CUDA device when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayMemory:
    """Fixed-capacity store of (obs, action, next_obs, reward) transitions.

    Once `capacity` transitions are held, each new one overwrites the oldest
    slot, cycling through the list in order.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next transition is written to.
        self.position = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, obs, action, next_obs, reward):
        """Store one transition, replacing the oldest one when the memory is full."""
        transition = (obs, action, next_obs, reward)

        if len(self.memory) < self.capacity:
            # Still growing: the write position is the end of the list.
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition

        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """
        Draws batch_size distinct transitions at random and regroups them by field,
        returning the tuple
            (obs, action, next_obs, reward)
        where each entry is itself a tuple with one item per sampled transition.
        """
        picked = random.sample(self.memory, batch_size)
        by_field = zip(*picked)
        return tuple(by_field)


class DQN(nn.Module):
    """Two-layer fully connected Q-network with epsilon-greedy action selection.

    Args:
        env_config: Mapping providing "batch_size", "gamma", "eps_start",
            "eps_end", "anneal_length", "n_actions" and "epsilon".
    """

    def __init__(self, env_config):
        super().__init__()

        # Save hyperparameters needed in the DQN class.
        self.batch_size = env_config["batch_size"]
        self.gamma = env_config["gamma"]
        # The annealing settings are stored here but act() does not read them;
        # exploration uses the fixed probability `self.epsilon`.
        self.eps_start = env_config["eps_start"]
        self.eps_end = env_config["eps_end"]
        self.anneal_length = env_config["anneal_length"]
        self.n_actions = env_config["n_actions"]

        self.epsilon = env_config["epsilon"]

        # 4 input features -> 256 hidden units -> one Q-value per action.
        self.fc1 = nn.Linear(4, 256)
        self.fc2 = nn.Linear(256, self.n_actions)

        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Maps a batch of observations to one Q-value per action."""
        x = self.relu(self.fc1(x))
        x = self.fc2(x)

        return x

    def act(self, observation, exploit=False):
        """Selects actions with an epsilon-greedy exploration strategy.

        Args:
            observation: Batch of observations of shape [batch, 4]; a single
                unbatched observation of shape [4] is treated as a batch of one.
                Tensors and array-likes are both accepted.
            exploit: When True, exploration is switched off and the greedy
                action is always returned (used for evaluation).

        Returns:
            An int64 tensor of shape [batch] with one action index per
            observation, located on the same device as the network.
        """
        # Put the observation where the weights live so a model loaded onto
        # another device than the observation still works. as_tensor avoids the
        # copy (and warning) that torch.tensor() produces for tensor inputs.
        reference = next(self.parameters())
        observation = torch.as_tensor(
            observation, dtype=reference.dtype, device=reference.device
        )
        if observation.dim() == 1:
            observation = observation.unsqueeze(0)

        # Explore with probability epsilon, but never when exploiting.
        if not exploit and random.random() < self.epsilon:
            # One uniformly random action per observation in the batch.
            return torch.randint(
                0, self.n_actions, (observation.size(0),), device=reference.device
            )

        # Greedy choice; no gradients are needed for action selection.
        with torch.no_grad():
            return self(observation).argmax(dim=1)

def optimize(dqn, target_dqn, memory, optimizer):
    """Samples a batch from the replay memory and takes one gradient step on the Q-network.

    Stored transitions are expected as (obs, action, next_obs, reward) where
    `obs`/`next_obs` are tensors with a leading batch dimension of 1, `action`
    and `reward` are scalar (or one-element) tensors, and `next_obs` is None
    for a terminal transition.

    Args:
        dqn: Online Q-network; provides `batch_size`, `gamma` and its parameters.
        target_dqn: Target Q-network used to evaluate next-state values.
        memory: Replay memory supporting len() and sample(batch_size).
        optimizer: Optimizer over the parameters of `dqn`.

    Returns:
        The scalar loss as a float, or None when the memory holds fewer than
        `dqn.batch_size` transitions and no update was made.
    """
    # If we don't have enough transitions stored yet, we don't train.
    if len(memory) < dqn.batch_size:
        return

    # Work on whatever device the online network lives on.
    model_device = next(dqn.parameters()).device

    observations, actions, next_observations, rewards = memory.sample(dqn.batch_size)

    # sample() returns tuples with one entry per transition; merge them into batches.
    observations = torch.cat(observations).to(model_device)
    actions = (
        torch.stack([torch.as_tensor(a) for a in actions])
        .reshape(-1, 1)
        .long()
        .to(model_device)
    )
    rewards = (
        torch.stack([torch.as_tensor(r) for r in rewards])
        .reshape(-1)
        .float()
        .to(model_device)
    )

    # True where the episode continued after the transition.
    non_terminal = torch.tensor(
        [nxt is not None for nxt in next_observations],
        dtype=torch.bool,
        device=model_device,
    )

    # Q(s, a) of the online network for the actions that were actually taken.
    q_values = dqn(observations).gather(1, actions).squeeze(1)

    # max_a' Q_target(s', a'); stays 0 for terminal transitions so their
    # target reduces to the immediate reward.
    next_q_values = torch.zeros(len(next_observations), device=model_device)
    if non_terminal.any():
        continuing = torch.cat(
            [nxt for nxt in next_observations if nxt is not None]
        ).to(model_device)
        with torch.no_grad():
            next_q_values[non_terminal] = target_dqn(continuing).max(1)[0]

    q_value_targets = rewards + dqn.gamma * next_q_values

    # Compute loss.
    loss = F.mse_loss(q_values, q_value_targets)

    # Perform gradient descent.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()


In [None]:
## train.py


# import gymnasium as gym
# import torch

# import copy

# import config
# from utils import preprocess
# from evaluate import evaluate_policy
# from dqn import DQN, ReplayMemory, optimize

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run settings. A fixed Namespace replaces command-line parsing inside the notebook.
#   env:                 environment id (only 'CartPole-v1' is configured)
#   evaluate_freq:       how often, in training episodes, to run evaluation
#   evaluation_episodes: number of evaluation episodes per evaluation run
# NOTE(review): evaluating 1000 episodes every 5 training episodes looks very
# expensive -- confirm these values are intended.
args = argparse.Namespace(
    env='CartPole-v1',
    evaluate_freq=5,
    evaluation_episodes=1000
)


# Hyperparameter configurations for different environments. The CartPole
# dictionary is defined in the notebook's config cell; no `config` module is
# imported in this cell, so it is referenced directly.
ENV_CONFIGS = {
    'CartPole-v1': CartPole
}

def evaluate_policy(dqn, env, env_config, args, n_episodes, render=False, verbose=False):
    """Runs {n_episodes} episodes to evaluate current policy.

    Args:
        dqn: Agent exposing act(observation, exploit=True).
        env: Environment to evaluate in.
        env_config: Hyperparameter dictionary of the environment (not read here).
        args: Namespace whose `env` attribute selects the preprocessing.
        n_episodes: Number of episodes to run.
        render: When True, env.render() is called before every step.
        verbose: When True, the return of each episode is printed.

    Returns:
        The mean undiscounted return over the evaluated episodes.
    """
    total_return = 0
    for i in range(n_episodes):
        obs, info = env.reset()
        obs = preprocess(obs, env=args.env).unsqueeze(0)

        done = False
        episode_return = 0

        while not done:
            if render:
                env.render()

            # env.step() expects a plain action, not a tensor.
            action = dqn.act(obs, exploit=True).item()
            obs, reward, terminated, truncated, info = env.step(action)
            # An episode is over when it terminates or is cut off by a time limit.
            done = terminated or truncated
            obs = preprocess(obs, env=args.env).unsqueeze(0)

            episode_return += reward

        total_return += episode_return

        if verbose:
            print(f'Finished episode {i+1} with a total return of {episode_return}')


    return total_return / n_episodes

if __name__ == '__main__':
    # Training script: runs epsilon-greedy episodes, stores every transition in the
    # replay memory, trains the Q-network from it, periodically syncs the target
    # network, and saves the agent whenever an evaluation matches or beats the best
    # mean return seen so far.

    # Initialize environment and config.
    env = gym.make(args.env)
    env_config = ENV_CONFIGS[args.env]

    # Initialize deep Q-networks.
    dqn = DQN(env_config=env_config).to(device)

    # The target network starts as an exact copy of the online network and is
    # only changed by the periodic synchronisation below.
    dqn_target = copy.deepcopy(dqn)
    dqn_target.eval()

    # Create replay memory.
    memory = ReplayMemory(env_config['memory_size'])

    # Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
    optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

    # Keep track of best evaluation mean return achieved so far.
    best_mean_return = -float("Inf")

    # torch.save() does not create missing parent directories.
    os.makedirs('models', exist_ok=True)

    # Counts environment steps across all episodes so that the training and
    # target-update frequencies do not restart with every episode.
    total_steps = 0

    try:
        for episode in range(env_config['n_episodes']):
            obs, info = env.reset()
            obs = preprocess(obs, env=args.env).unsqueeze(0)

            done = False
            while not done:
                # Get action from DQN.
                action = dqn.act(obs).item()

                # Act in the true environment.
                next_obs, reward, terminated, truncated, info = env.step(action)
                # A time-limit truncation ends the episode too.
                done = terminated or truncated

                # Terminal transitions store None as next observation so the
                # update can tell that no future value follows them.
                if terminated:
                    next_obs = None
                else:
                    next_obs = preprocess(next_obs, env=args.env).unsqueeze(0)

                # Add the transition to the replay memory as tensors.
                memory.push(
                    obs,
                    torch.tensor(action, device=device),
                    next_obs,
                    torch.tensor(reward, dtype=torch.float32, device=device),
                )
                obs = next_obs
                total_steps += 1

                # Train the online network every train_frequency steps.
                if total_steps % env_config["train_frequency"] == 0:
                    optimize(dqn, dqn_target, memory, optimizer)

                # Sync the target network every target_update_frequency steps.
                if total_steps % env_config["target_update_frequency"] == 0:
                    dqn_target.load_state_dict(dqn.state_dict())

            # Evaluate the current agent.
            if episode % args.evaluate_freq == 0:
                mean_return = evaluate_policy(dqn, env, env_config, args, n_episodes=args.evaluation_episodes)
                print(f'Episode {episode+1}/{env_config["n_episodes"]}: {mean_return}')

                # Save current agent if it has the best performance so far.
                if mean_return >= best_mean_return:
                    best_mean_return = mean_return

                    print('Best performance so far! Saving model.')
                    torch.save(dqn, f'models/{args.env}_best.pt')
    finally:
        # Close environment after training is completed (or interrupted).
        env.close()


In [None]:
## evaluate.py

import argparse

import gymnasium as gym
import torch

import config
from utils import preprocess

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Evaluation settings. A fixed Namespace replaces command-line parsing inside the notebook.
#   env:             environment id (only 'CartPole-v1' is configured)
#   path:            path to the stored DQN model
#   n_eval_episodes: number of evaluation episodes
#   render:          render the environment
#   save_video:      save the episodes as video
args = argparse.Namespace(**{
    'env': 'CartPole-v1',
    'path': 'models/CartPole-v1_best.pt',
    'n_eval_episodes': 1000,
    'render': False,
    'save_video': False,
})


# Hyperparameter configurations for different environments. See config.py.
# NOTE(review): this reads the imported `config` module rather than the CartPole
# dictionary defined in the notebook's config cell -- confirm that a config.py
# is actually importable from the working directory.
ENV_CONFIGS = dict([('CartPole-v1', config.CartPole)])


def evaluate_policy(dqn, env, env_config, args, n_episodes, render=False, verbose=False):
    """Runs {n_episodes} episodes to evaluate current policy.

    Args:
        dqn: Agent exposing act(observation, exploit=True).
        env: Environment to evaluate in.
        env_config: Hyperparameter dictionary of the environment (not read here).
        args: Namespace whose `env` attribute selects the preprocessing.
        n_episodes: Number of episodes to run.
        render: When True, env.render() is called before every step.
        verbose: When True, the return of each episode is printed.

    Returns:
        The mean undiscounted return over the evaluated episodes.
    """
    total_return = 0
    for i in range(n_episodes):
        obs, info = env.reset()
        obs = preprocess(obs, env=args.env).unsqueeze(0)

        done = False
        episode_return = 0

        while not done:
            if render:
                env.render()

            # env.step() expects a plain action, not a tensor.
            action = dqn.act(obs, exploit=True).item()
            obs, reward, terminated, truncated, info = env.step(action)
            # An episode is over when it terminates or is cut off by a time limit.
            done = terminated or truncated
            obs = preprocess(obs, env=args.env).unsqueeze(0)

            episode_return += reward

        total_return += episode_return

        if verbose:
            print(f'Finished episode {i+1} with a total return of {episode_return}')


    return total_return / n_episodes

if __name__ == '__main__':
    # Evaluation script: loads the stored agent and reports its mean return,
    # optionally recording every episode as a video.

    # Initialize config.
    env_config = ENV_CONFIGS[args.env]

    # Create exactly one environment so no unused instance is left open.
    if args.save_video:
        env = gym.make(args.env, render_mode='rgb_array')
        env = gym.wrappers.RecordVideo(env, './video/', episode_trigger=lambda episode_id: True)
    else:
        env = gym.make(args.env)

    try:
        # Load model from provided path, onto the same device preprocess() places
        # observations on. The file is a pickle of the whole module, which needs
        # weights_only=False on PyTorch versions that default to weights-only loading.
        # NOTE(review): unpickling executes arbitrary code -- only load model
        # files produced by a trusted training run.
        dqn = torch.load(args.path, map_location=device, weights_only=False)
        dqn.eval()

        mean_return = evaluate_policy(dqn, env, env_config, args, args.n_eval_episodes, render=args.render and not args.save_video, verbose=True)
        print(f'The policy got a mean return of {mean_return} over {args.n_eval_episodes} episodes.')
    finally:
        # Release the environment (and finalise any recording) even on failure.
        env.close()