In [1]:
from matplotlib import pyplot as plt
from torch import optim
from tqdm import tqdm
from utils import get_epsilon, get_weights, load_buffer, make_env, test_agent, UniformReplay


import ale_py
import gymnasium as gym
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

# Locations for artifacts made during training and testing:
# VIDS is handed to make_env as the video directory, WEIGHTS receives
# the model checkpoints and the saved replay buffer.
VIDS, WEIGHTS = './vids', './weights_buffers'

# Create both locations up front; an already existing directory is fine.
os.makedirs(name=VIDS, exist_ok=True)
os.makedirs(name=WEIGHTS, exist_ok=True)

In [2]:
class QNet(nn.Module):
    """Convolutional Q-network modeled after the architecture DeepMind
       proposed in 2015, with minor modifications and an optional
       dueling head.

       Three convolutional layers feed a 512-unit fully connected
       layer. Without dueling, a single linear layer maps those
       features to one Q-value per action. With dueling, the features
       go through a state-value stream and an advantage stream that
       are recombined in forward().
    """

    def __init__(self, obs_space:"gym.spaces.Box", act_space:"gym.spaces.Discrete", dueling:bool=False):
        """Defines the network building blocks in order to learn
        a given environment.

        Parameters:
            - obs_space (gym.spaces.Box) : The observation space of
                                          the environment. Its shape
                                          must be channels-first, i.e.
                                          (channels, height, width).
            - act_space (gym.spaces.Discrete) : The action space of
                                               the environment; its
                                               `n` attribute sets the
                                               number of Q-values.
            - dueling (bool) : whether or not to use the dueling
                              network architecture. Defaulted to
                              False.

        Returns:
            - None
        """

        super().__init__()

        in_channels = obs_space.shape[0]
        out_features = act_space.n
        self.dueling = dueling

        self.conv = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=8, stride=4),
                                  nn.ReLU(),
                                  nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
                                  nn.ReLU(),
                                  nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
                                  nn.ReLU())

        # Infer the flattened size of the convolutional output from the
        # observation shape rather than hard-coding 64*7*7, which only
        # holds for 84 x 84 frames (and is what this evaluates to there).
        with torch.no_grad():
            dummy_obs = torch.zeros(1, *obs_space.shape)
            conv_out_features = self.conv(dummy_obs).flatten(start_dim=1).shape[1]

        self.fc1 = nn.Sequential(nn.Flatten(),
                                 nn.Linear(in_features=conv_out_features, out_features=512),
                                 nn.ReLU())

        # Create dueling streams
        if self.dueling:
            self.val_stream = nn.Linear(in_features=512, out_features=1)
            self.adv_stream = nn.Linear(in_features=512, out_features=out_features)

        # Normal architecture
        else:
            self.fc2 = nn.Linear(in_features=512, out_features=out_features)

    def forward(self, imgs:torch.Tensor) -> torch.Tensor:
        """Defines the forward pass of the network: maps a batch of
        states to the Q-values of every available action.

        Parameters:
            - imgs (torch.Tensor) : the stacked images representing
                                    the "state" of the environment.
                                    The tensor must be channels-first,
                                    of shape N x m x H x W, where N is
                                    the batch size, m = obs_space.shape[0]
                                    and H x W is the frame size. A single
                                    state still needs the batch dimension
                                    (N = 1).
        Returns:
            - (torch.Tensor) : The approximated Q-values Q(s,a), for the
                               given states. The tensor is of shape
                               N x act_space.n.
        """

        x_fc1 = self.fc1(self.conv(imgs))

        # Compute Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)). Subtracting the
        # per-state mean advantage makes the mean of Q over actions equal V.
        if self.dueling:

            state_vals = self.val_stream(x_fc1)
            adv_vals = self.adv_stream(x_fc1)
            q_vals = state_vals + (adv_vals - adv_vals.mean(dim=1, keepdim=True))

        else:
            q_vals = self.fc2(x_fc1)

        return q_vals

In [None]:
def DQN(env:gym.Env, device:torch.device, optimizer_LR:float=0.00025, α:float=0.95, eps:float=0.01,
        objective:nn.Module=nn.HuberLoss(), batch_size:int=32, buffer_size:int=int(1e6),
        buffer_start_size:int=50_000, gamma:float=0.99, num_episodes:int=int(3e6),
        episodes_decay:int=int(6e5), epsilon_start:float=1.0, epsilon_min:float=0.1,
        grad_update_freq:int=4, target_update_freq:int=10_000, save_freq:int=5_000, use_DDQN:bool=False,
        use_dueling:bool=False) -> None:
    """Perform a modified version of DeepMind's DQN algorithm
    to train an agent to learn and solve the given environment.
    The function first looks for previously saved weights in
    WEIGHTS to continue training. If none are found, it starts
    from randomly initialized weights. The online network's
    weights are saved every save_freq episodes and after the
    final episode; the replay buffer is saved after the final
    episode only.

    NOTE: many of the default parameters are exact or close to
    those given by the 2015 paper.

    Parameters:
        - env (gym.Env) : the environment to emulate. It is
                          closed when training finishes.
        - device (torch.device) : the device on which the
                                 networks and tensors live.
        - optimizer_LR (float) : the learning rate for the RMSprop
                                optimizer. Defaulted to 0.00025.
        - α (float) : the RMSprop smoothing constant (squared
                      gradient momentum). Defaulted to 0.95.
        - eps (float) : constant added to the denominator in
                        RMSprop. Defaulted to 0.01.
        - objective (nn.Module) : the loss to optimize, called as
                                  objective(input, target).
                                  Defaulted to nn.HuberLoss().
        - batch_size (int) : the number of experiences sampled
                            from the replay buffer per gradient
                            step. Defaulted to 32.
        - buffer_size (int) : the capacity of the replay buffer.
                             Defaulted to 1_000_000.
        - buffer_start_size (int) : the minimum size passed to
                                   the replay buffer (min_size),
                                   i.e. the number of experiences
                                   it should hold before sampling.
                                   Defaulted to 50_000.
        - gamma (float) : the discount factor used for future
                         rewards. Defaulted to 0.99.
        - num_episodes (int) : the number of episodes used to
                              train the agent. Defaulted to
                              3_000_000.
        - episodes_decay (int) : the number of episodes passed
                                to get_epsilon over which epsilon
                                is decayed from epsilon_start to
                                epsilon_min. Defaulted to 600_000.
        - epsilon_start (float) : the starting value of epsilon.
                                 Defaulted to 1.
        - epsilon_min (float) : the ending value of epsilon.
                               Defaulted to 0.1.
        - grad_update_freq (int) : the number of environment steps
                                  between successive gradient
                                  descent updates. Defaulted to 4.
        - target_update_freq (int) : the number of gradient
                                    descent updates between
                                    successive copies of the
                                    online weights into the target
                                    Q-network. Defaulted to 10_000.
        - save_freq (int) : the number of episodes between
                           successive saves of the online
                           Q-network. Defaulted to 5_000.
        - use_DDQN (bool) : whether or not to use Double DQN
                           targets. Defaulted to False.
        - use_dueling (bool) : whether or not to use the dueling
                               architecture. Defaulted to False.

    Returns:
        - None
    """

    buffer = UniformReplay(capacity=buffer_size, min_size=buffer_start_size)
    buffer.initialize(env)

    Qnet = QNet(env.observation_space, env.action_space, dueling=use_dueling).to(device)
    target_Qnet = QNet(env.observation_space, env.action_space, dueling=use_dueling).to(device)

    recent_weights = get_weights(dir=WEIGHTS, device=device)
    if recent_weights is None:
        print('Initializing with random weights.')
        target_Qnet.load_state_dict(Qnet.state_dict())

    else:
        print('Working from a trained model.')
        Qnet.load_state_dict(recent_weights)
        target_Qnet.load_state_dict(recent_weights)

        # Weights are checkpointed every save_freq episodes but the buffer is
        # only saved after the last one, so a saved buffer may be missing.
        # NOTE(review): assumes load_buffer returns None in that case - confirm.
        # The freshly initialized buffer above then serves as the fallback.
        saved_buffer = load_buffer(dir=WEIGHTS)
        if saved_buffer is not None:
            buffer = saved_buffer

    optimizer = optim.RMSprop(params=Qnet.parameters(), lr=optimizer_LR, alpha=α, eps=eps)
    timesteps = 0
    parameters_updated = 0

    pbar = tqdm(total=num_episodes, desc='Episode', position=0)

    for episode in range(num_episodes):

        state, info = env.reset()
        ε = get_epsilon(episode, episodes_decay, epsilon_start, epsilon_min)
        episodic_loss = 0.0
        loss_ct = 0
        done = False

        while not done:

            # Epsilon-greedy action selection
            if random.uniform(0, 1) < ε:
                action = int(env.action_space.sample())    # plain int, like the greedy branch

            else:
                with torch.no_grad():
                    Qvals = Qnet(torch.tensor(state, dtype=torch.float).unsqueeze(0).to(device))
                    action = Qvals.argmax(dim=1).cpu().numpy().item()

            next_state, reward, terminated, trunc, info = env.step(action)

            # The episode is over on termination OR truncation, but only a
            # true termination is stored: a truncated state still has future
            # value and must keep bootstrapping in the target below.
            done = terminated or trunc
            buffer.add(state, action, next_state, reward, terminated)
            timesteps += 1

            if timesteps % grad_update_freq == 0:
                experience_batch = buffer.sample(batch_size=batch_size)
                states = torch.tensor(experience_batch[0], dtype=torch.float).to(device)
                actions = torch.tensor(experience_batch[1], dtype=torch.long).to(device)
                next_states = torch.tensor(experience_batch[2], dtype=torch.float).to(device)
                # Flatten to shape (N,) so every term of the target has the
                # same shape and nothing silently broadcasts to (N, N).
                rewards = torch.tensor(experience_batch[3], dtype=torch.float).to(device).reshape(-1)
                terminations = torch.tensor(experience_batch[4], dtype=torch.float).to(device).reshape(-1)

                # Compute y_j. The target is a constant with respect to the
                # loss, so no graph is built for either variant.
                with torch.no_grad():
                    if use_DDQN:
                        # Double DQN: the online net picks the action, the target net scores it
                        online_best_Qindices = Qnet(next_states).argmax(dim=1, keepdim=True)
                        next_Qvals = target_Qnet(next_states).gather(dim=1, index=online_best_Qindices).squeeze(-1)

                    else:
                        next_Qvals = target_Qnet(next_states).max(dim=1).values

                    y_j = rewards + gamma * next_Qvals * (1. - terminations)

                curr_Qvals = Qnet(states).gather(dim=1, index=actions.unsqueeze(1)).squeeze(-1)
                loss = objective(curr_Qvals, y_j)    # (input, target) order
                episodic_loss += loss.item()
                loss_ct += 1

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                parameters_updated += 1

                # Every C parameter updates, sync the target network. Checked
                # here so the copy happens once per threshold rather than on
                # every timestep until the next gradient step.
                if parameters_updated % target_update_freq == 0:
                    target_Qnet.load_state_dict(Qnet.state_dict())

            state = next_state

        # An episode may contain no gradient step at all
        avg_loss = episodic_loss / loss_ct if loss_ct > 0 else 0
        # Assumes the environment reports episode statistics under
        # info['episode'] on the final step - TODO confirm against make_env.
        episodic_reward = info['episode']['r']
        pbar.set_postfix(episodic_reward=episodic_reward, avg_loss=avg_loss)
        pbar.update()

        # Save model weights (including at last episode)
        if (episode % save_freq == 0) or (episode == num_episodes - 1):
            filename = '/model_weights' + f"_episode{episode}.pth"
            torch.save(Qnet.state_dict(), WEIGHTS+filename)

            if episode == num_episodes - 1:
                buffer.save(drl_directory=WEIGHTS)

    pbar.close()
    env.close()

In [4]:
train_env = make_env(vid_dir=VIDS)  # Use default breakout game

# Backend preference: the first CUDA GPU, then Apple MPS, then the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

A.L.E: Arcade Learning Environment (version 0.10.2+c9d4b19)
[Powered by Stella]
  logger.warn(


In [5]:
# Short run with reduced buffer/episode settings, using both
# Double DQN targets and the dueling architecture.
DQN(
    env=train_env,
    device=device,
    buffer_size=5_000,
    buffer_start_size=1_000,
    num_episodes=100,
    target_update_freq=150,
    episodes_decay=30,
    use_DDQN=True,
    use_dueling=True,
)

Initializing memory buffer...
Memory buffer initialized.
Initializing with random weights.


Episode: 100%|██████████| 100/100 [02:33<00:00,  1.53s/it, avg_loss=0.0036, episodic_reward=2] 

Replay buffer saved to ./weights_buffers/replay_buffer.pickle.





In [None]:
# Backend preference: the first CUDA GPU, then Apple MPS, then the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Separate environment for evaluation, with its own video name prefix.
test_env = make_env(
    VIDS,
    recording_freq=5,
    vid_name_prefix='drl-test-video',
)

In [None]:
# Evaluate for two episodes with weights taken from WEIGHTS; the dueling
# flag matches the architecture used in the training run above.
test_agent(
    QNet,
    test_env,
    device,
    WEIGHTS,
    uses_dueling=True,
    num_test_episodes=2,
)