In [None]:
!pip install gym[atari,accept-rom-license]==0.25.2
import sys, os
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [22]:
import torch
import torchvision
import numpy as np
import random
from gym.spaces import Box
from collections import deque

# Multi-step horizon shared by the notebook: ExperienceReplayMemory.store reads it
# when accumulating rewards, and compute_loss uses gamma ** steps as the bootstrap discount.
steps = 3 # 3 for multistep learning, 1 for standard

class SkipFrame(gym.Wrapper):
    """Repeat every action for up to `skip` consecutive env steps and sum the rewards."""

    def __init__(self, env, skip):
        """Wrap `env`; `skip` is the maximum number of inner steps taken per outer step."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply `action` repeatedly and return the last observation with the summed reward.

        Returns the (obs, reward, done, info) tuple of the last inner step, with
        `reward` replaced by the sum over all inner steps that were executed.
        """
        reward_sum = 0.0
        done = False
        for _ in range(self._skip):
            obs, step_reward, done, info = self.env.step(action)
            reward_sum += step_reward
            # Stop repeating as soon as the episode is over.
            if done:
                break
        return obs, reward_sum, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Convert channel-last colour frames into grayscale torch tensors."""

    def __init__(self, env):
        """Wrap `env` and declare an observation space without the channel axis."""
        super().__init__(env)
        height_width = self.observation_space.shape[:2]
        # NOTE(review): the space is declared as uint8 (H, W), while observation()
        # returns a float tensor produced by torchvision's Grayscale transform.
        self.observation_space = Box(low=0, high=255, shape=height_width, dtype=np.uint8)

    def observation(self, observation):
        """Return the grayscale version of `observation` as a float tensor."""
        # Move the channel axis to the front, the layout torchvision transforms expect.
        channels_first = np.transpose(observation, (2, 0, 1))
        frame = torch.tensor(channels_first.copy(), dtype=torch.float)
        to_gray = torchvision.transforms.Grayscale()
        return to_gray(frame)


class ResizeObservation(gym.ObservationWrapper):
    """Resize observations to a fixed spatial size and rescale them with Normalize(0, 255)."""

    def __init__(self, env, shape):
        """Wrap `env`; `shape` is an int (square output) or an iterable of two sizes."""
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)
        # Keep whatever dimensions follow the two spatial axes of the wrapped space.
        trailing_dims = self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=self.shape + trailing_dims, dtype=np.uint8)

    def observation(self, observation):
        """Resize and normalise `observation`, then drop its leading axis if it has size 1."""
        resize = torchvision.transforms.Resize(self.shape)
        # mean=0, std=255
        rescale = torchvision.transforms.Normalize(0, 255)
        pipeline = torchvision.transforms.Compose([resize, rescale])
        return pipeline(observation).squeeze(0)


class ExperienceReplayMemory(object):
    """Fixed-capacity replay buffer that stores complete n-step transitions.

    Incoming one-step transitions are held in a short pending window. Once the
    window spans the n-step horizon (or the episode ends), the oldest pending
    transition is written to `memory` as
    ``(state_t, next_state_{t+k-1}, action_t, sum_i discount**i * reward_{t+i}, done_{t+k-1})``
    where ``k`` is the number of transitions in the window. Only such completed
    tuples are ever sampled, so the stored next state and done flag always match
    the accumulated reward.

    Args:
        capacity: maximum number of stored transitions (oldest are evicted first).
        n_steps: multi-step horizon; when None the module-level `steps` is read
            on every `store` call.
        discount: per-step discount; when None the module-level `gamma` is read
            on every multi-step `store` call.
    """

    def __init__(self, capacity, n_steps=None, discount=None):
        self.memory = deque([], maxlen=capacity)
        self._n_steps = n_steps
        self._discount = discount
        # One-step transitions of the current episode that are not yet aggregated.
        self._pending = deque()

    def __len__(self):
        """Number of transitions that are available for sampling."""
        return len(self.memory)

    def _emit_oldest(self, horizon, discount):
        """Aggregate the oldest pending transition over the window and move it to `memory`."""
        span = min(horizon, len(self._pending))
        window = [self._pending[k] for k in range(span)]
        n_step_return = 0.0
        for k, transition in enumerate(window):
            n_step_return += (discount ** k) * transition[3]
        first, last = window[0], window[-1]
        self.memory.append((first[0], last[1], first[2], n_step_return, last[4]))
        self._pending.popleft()

    def store(self, state, next_state, action, reward, done):
        """Record one environment transition.

        With a horizon of 1 the transition is stored unchanged. Otherwise it is
        appended to the pending window; full windows are emitted as n-step
        transitions, and a terminal transition flushes the whole window so that
        no return ever spans two episodes.
        """
        state = state.__array__()
        next_state = next_state.__array__()

        horizon = steps if self._n_steps is None else self._n_steps
        horizon = max(1, int(horizon))
        if horizon == 1:
            self._pending.clear()
            self.memory.append((state, next_state, action, reward, done))
            return

        discount = gamma if self._discount is None else self._discount

        # If the new state does not continue the previous transition, the episode
        # was cut off without a terminal flag; drop the incomplete window instead
        # of mixing rewards from two episodes.
        if self._pending and not np.array_equal(self._pending[-1][1], state):
            self._pending.clear()

        self._pending.append((state, next_state, action, reward, done))

        if done:
            # Terminal step: every pending transition ends here (done=True), so
            # the bootstrap term is masked out whatever its actual length is.
            while self._pending:
                self._emit_oldest(horizon, discount)
        else:
            while len(self._pending) >= horizon:
                self._emit_oldest(horizon, discount)

    def sample(self, batch_size):
        """Uniformly sample `batch_size` stored transitions without replacement.

        Returns:
            Tensors for state, next_state, action, reward and done, each with the
            batch along the first axis.

        Raises:
            ValueError: if fewer than `batch_size` transitions are stored
                (raised by `random.sample`).
        """
        batch = random.sample(self.memory, batch_size)
        state, next_state, action, reward, done = zip(*batch)

        # Stack into single arrays first; building a tensor from a tuple of
        # arrays element by element is much slower.
        state = np.array(state)
        next_state = np.array(next_state)

        return torch.tensor(state), torch.tensor(next_state), torch.tensor(action), torch.tensor(reward), torch.tensor(done)


In [29]:
import torch
import gym
import numpy as np
import copy
from gym.wrappers import FrameStack


# Run configuration.
env_rendering = False    # Set to False while training your model on Colab
testing_mode = False     # True: evaluate a saved model instead of training
test_model_directory = './your_saved_model.pth.tar'
run_as_ddqn = True       # True: Double DQN targets, False: standard DQN targets

# Create and preprocess the Atari Breakout environment
if env_rendering:
    env = gym.make("ALE/Breakout-v5", full_action_space=False, render_mode="human")
else:
    env = gym.make("ALE/Breakout-v5", full_action_space=False)
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)
image_stack, h, w = env.observation_space.shape
num_actions = env.action_space.n
print('Number of stacked frames: ', image_stack)
print('Resized observation space dimensionality: ', h, w)
print('Number of available actions by the agent: ', num_actions)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed every random number generator the notebook draws from: the environment,
# NumPy (epsilon test in the policy), torch (random actions, weight init) and
# Python's `random` module (replay-buffer sampling).
seed = 61
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.backends.cudnn.enabled:
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Hyperparameters (to be modified)
batch_size = 20
alpha = 0.00025                                  # Adam learning rate
gamma = 0.95                                     # discount factor
eps, eps_decay, min_eps = 1.0, 0.999, 0.05       # epsilon-greedy schedule
buffer = ExperienceReplayMemory(5000)
burn_in_phase = 1000                             # steps collected before any gradient update
sync_target = 2000                               # steps between target-network syncs
max_train_frames = 1500                          # maximum agent steps per episode
max_train_episodes = 4000
max_test_episodes = 20
curr_step = 0


Number of stacked frames:  4
Resized observation space dimensionality:  84 84
Number of available actions by the agent:  4
cuda


  deprecation(
  deprecation(


In [24]:
def convert(x):
    """Copy any object exposing `__array__` into a new float32 torch tensor."""
    as_array = x.__array__()
    copied = torch.tensor(as_array)
    return copied.float()


class DeepQNet(torch.nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        h: height of the input frames.
        w: width of the input frames.
        image_stack: number of stacked frames (input channels).
        num_actions: number of discrete actions (output units).

    Raises:
        ValueError: if `h` or `w` is too small for the three convolutions.
    """

    # (kernel_size, stride) of conv1, conv2 and conv3, in order.
    _CONV_SPECS = ((8, 4), (4, 2), (3, 1))

    def __init__(self, h, w, image_stack, num_actions):
        super(DeepQNet, self).__init__()

        self.conv1 = torch.nn.Conv2d(in_channels=image_stack, out_channels=32, kernel_size=8, stride=4)
        self.act1 = torch.nn.ReLU()

        self.conv2 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.act2 = torch.nn.ReLU()

        self.conv3 = torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.act3 = torch.nn.ReLU()

        self.flat = torch.nn.Flatten()

        # Derive the flattened feature size from the input size instead of
        # hard-coding 64 * 7 * 7 (which is only right for 84x84 frames).
        conv_h = self._conv_output_size(h)
        conv_w = self._conv_output_size(w)
        if conv_h < 1 or conv_w < 1:
            raise ValueError(f"Input of size {h}x{w} is too small for the convolutional stack")

        self.fc1 = torch.nn.Linear(64 * conv_h * conv_w, 256)
        self.act4 = torch.nn.ReLU()

        self.fc2 = torch.nn.Linear(256, num_actions)

    @classmethod
    def _conv_output_size(cls, size):
        """Spatial size left after the three unpadded convolutions."""
        for kernel, stride in cls._CONV_SPECS:
            size = (size - kernel) // stride + 1
        return size

    def forward(self, x):
        """Map a batch of stacked frames (N, image_stack, h, w) to Q-values (N, num_actions)."""
        x = self.act1(self.conv1(x))
        x = self.act2(self.conv2(x))
        x = self.act3(self.conv3(x))

        x = self.flat(x)

        x = self.act4(self.fc1(x))
        return self.fc2(x)


# Online network (trained) and target network (a frozen copy that is only
# refreshed from the online weights, never optimised directly).
online_dqn = DeepQNet(h, w, image_stack, num_actions)
target_dqn = copy.deepcopy(online_dqn)
target_dqn.requires_grad_(False)

online_dqn.to(device)
target_dqn.to(device)

# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(online_dqn.parameters(), lr=alpha)

# Set this variable to False to run HuberLoss criterion in task 1.c)
mse_vs_hubber = True

criterion = torch.nn.MSELoss() if mse_vs_hubber else torch.nn.HuberLoss()



In [25]:
def policy(state, is_training):
    """Epsilon-greedy action selection with the online network.

    Args:
        state: current observation; anything accepted by `convert`.
        is_training: when True, a uniformly random action is taken with
            probability `eps`; when False the action is always greedy.

    Returns:
        A 0-dimensional int64 CPU tensor holding the chosen action index.
    """
    # Exploration: no forward pass (and no state conversion) is needed.
    if is_training and np.random.rand() < eps:
        return torch.randint(0, num_actions, (1,), dtype=torch.long).squeeze(0)

    # Exploitation: action selection never needs gradients, in training or testing.
    state = convert(state).unsqueeze(0).to(device)
    with torch.no_grad():
        q_vals = online_dqn(state)
    return torch.tensor(torch.argmax(q_vals).item())


def compute_loss(state, action, reward, next_state, done):
    """Compute the DQN (or Double DQN, if `run_as_ddqn`) loss for a batch or a single transition.

    Args:
        state, next_state: observations, either batched (N, C, H, W) or a single
            unbatched (C, H, W) observation.
        action: tensor of action indices.
        reward: reward(s) as a float, sequence or tensor.
        done: terminal flag(s) as a bool, sequence or tensor.

    Returns:
        Scalar tensor: `criterion` between Q(state, action) of the online
        network and ``reward + gamma**steps * Q_target(next_state) * (1 - done)``.

    Note:
        The bootstrap term is always discounted by ``gamma ** steps``, also for
        the single one-step transitions passed in during evaluation.
    """
    state = convert(state).to(device)
    next_state = convert(next_state).to(device)

    # A single transition arrives without a batch axis; add it based on the
    # actual rank rather than on the global testing flag.
    if state.dim() == 3:
        state = state.unsqueeze(0)
    if next_state.dim() == 3:
        next_state = next_state.unsqueeze(0)

    # as_tensor accepts both Python scalars and existing tensors without the
    # copy-construct warning that torch.tensor(tensor) emits.
    action = torch.as_tensor(action, device=device).long().view(-1, 1)
    reward = torch.as_tensor(reward, dtype=torch.float32, device=device).view(-1, 1)
    done = torch.as_tensor(done, device=device).view(-1, 1)
    not_done = 1.0 - done.float()

    with torch.no_grad():
        if run_as_ddqn:
            # Double DQN: the online network picks the action, the target network scores it.
            best_next_actions = torch.argmax(online_dqn(next_state), dim=1).view(-1, 1)
            max_next_q_vals = target_dqn(next_state).gather(1, best_next_actions)
        else:
            # Standard DQN: the target network both picks and scores the action.
            max_next_q_vals = torch.max(target_dqn(next_state), dim=1)[0].view(-1, 1)

    target_q = reward + (gamma ** (steps)) * max_next_q_vals * not_done

    prediction_current_q = online_dqn(state).gather(1, action)

    return criterion(prediction_current_q, target_q)


def run_episode(curr_step, buffer, is_training):
    """Play one episode of at most `max_train_frames` agent steps.

    In training mode every transition is stored in `buffer` and, once
    `curr_step` exceeds `burn_in_phase`, one optimisation step is taken per
    environment step. In evaluation mode only the loss of each visited
    transition is measured.

    Args:
        curr_step: global step counter before the episode.
        buffer: replay memory providing `store`, `sample` and `len()`.
        is_training: whether to explore, store transitions and optimise.

    Returns:
        (metrics, curr_step): `metrics` is ``dict(reward=..., loss=...)`` with the
        summed episode reward and the loss averaged over the steps taken, and
        `curr_step` is the updated global step counter.
    """
    episode_reward, episode_loss = 0, 0.
    steps_taken = 0
    state = env.reset()

    for _ in range(max_train_frames):
        action = policy(state, is_training)
        curr_step += 1
        steps_taken += 1
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward

        if is_training:
            buffer.store(state, next_state, action, reward, done)

            # Learn only after the burn-in phase and once a full batch can be drawn.
            if curr_step > burn_in_phase and len(buffer) >= batch_size:
                state_batch, next_state_batch, action_batch, reward_batch, done_batch = buffer.sample(batch_size)

                if curr_step % sync_target == 0:
                    # Periodically copy the online weights into the target network.
                    target_dqn.load_state_dict(online_dqn.state_dict())

                loss = compute_loss(state_batch, action_batch, reward_batch, next_state_batch, done_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                episode_loss += loss.item()
        else:
            with torch.no_grad():
                episode_loss += compute_loss(state, action, reward, next_state, done).item()

        state = next_state

        if done:
            break

    # Average over the number of steps actually taken; dividing by the last
    # loop index would be off by one and fail for one-step episodes.
    mean_loss = episode_loss / steps_taken if steps_taken else 0.0
    return dict(reward=episode_reward, loss=mean_loss), curr_step


In [26]:
def update_metrics(metrics, episode):
    """Append every value in `episode` to the list stored under the same key in `metrics`."""
    for key in episode:
        metrics[key].append(episode[key])


def print_metrics(it, metrics, is_training, window=100):
    """Print the mean reward and mean loss over the last `window` episodes."""
    recent_rewards = metrics['reward'][-window:]
    recent_losses = metrics['loss'][-window:]
    reward_mean = np.mean(recent_rewards)
    loss_mean = np.mean(recent_losses)
    if is_training:
        mode = "train"
    else:
        mode = "test"
    print(f"Episode {it:4d} | {mode:5s} | reward {reward_mean:5.5f} | loss {loss_mean:5.5f}")


def save_checkpoint(curr_step, eps, train_metrics, path='./your_saved_model.pth.tar'):
    """Write the training state and both network state dicts to `path`.

    Args:
        curr_step: global step counter.
        eps: current exploration rate.
        train_metrics: metrics collected so far.
        path: destination file; defaults to the location used by the notebook.
    """
    save_dict = {'curr_step': curr_step,
                 'train_metrics': train_metrics,
                 'eps': eps,
                 'online_dqn': online_dqn.state_dict(),
                 'target_dqn': target_dqn.state_dict()}
    torch.save(save_dict, path)


In [27]:
# TODO: Plot your train_metrics and test_metrics
# ...
def plot_metrics(name, metrics):
    """Plot per-episode reward and loss side by side and save the figure as a PDF.

    Args:
        name: label used in the figure title and in the output file name; when
            it starts with 'Testing' the mean reward is drawn as a dashed line.
        metrics: mapping with 'reward' and 'loss' sequences, one entry per episode.
    """
    fig, (reward_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))
    fig.suptitle('Atari game results - ' + name, fontsize=18)

    # Left panel: accumulated reward per episode.
    rewards = metrics['reward']
    reward_ax.plot(rewards, lw=2, color='red')
    reward_ax.set_ylabel('Accumulated reward')
    reward_ax.set_xlabel('Episode')
    if name.startswith('Testing'):
        reward_ax.axhline(np.mean(rewards), color='black', linestyle='dashed', lw=3)

    # Right panel: loss per episode.
    loss_ax.plot(metrics['loss'], lw=2, color='red')
    loss_ax.set_xlabel('Episode')
    loss_ax.set_ylabel('Loss')

    plt.tight_layout(pad=2.0)
    plt.savefig(f'Atarigame_{name}.pdf')

In [None]:
# Label for this run, e.g. "ddqn3": algorithm variant followed by the multi-step horizon.
algo_name = "ddqn" if run_as_ddqn else "dqn"
var = algo_name + str(steps)
print(var)
if testing_mode:
    # Load the saved online network; map_location lets a checkpoint written on
    # a GPU machine be evaluated on a CPU-only machine as well.
    checkpoint = torch.load(test_model_directory, map_location=device)
    online_dqn.load_state_dict(checkpoint['online_dqn'])

    test_metrics = dict(reward=[], loss=[])
    for it in range(max_test_episodes):
        episode_metrics, curr_step = run_episode(curr_step, buffer, is_training=False)
        update_metrics(test_metrics, episode_metrics)
        print_metrics(it + 1, test_metrics, is_training=False)
    plot_metrics("Testing "+var, test_metrics)
else:
    train_metrics = dict(reward=[], loss=[])
    for it in range(max_train_episodes):
        episode_metrics, curr_step = run_episode(curr_step, buffer, is_training=True)
        update_metrics(train_metrics, episode_metrics)
        # Epsilon decays once per episode, and only after the burn-in phase.
        if curr_step > burn_in_phase and eps > min_eps:
            eps *= eps_decay
        if it % 50 == 0:
            print_metrics(it, train_metrics, is_training=True)
            save_checkpoint(curr_step, eps, train_metrics)

    # Persist the final weights too; the periodic save above stops at the last
    # multiple of 50 and would otherwise miss the remaining episodes.
    save_checkpoint(curr_step, eps, train_metrics)

    plot_metrics("Training "+var, train_metrics)
