In [20]:
import torch
import torch.nn as nn
from torch import optim
import numpy as np

import random
import math

import matplotlib.pyplot as plt
import matplotlib
from IPython.display import clear_output

from collections import namedtuple, deque

import gymnasium as gym

import time

In [21]:
env = gym.make("CartPole-v1")

n_observations = 4
n_actions = 2

device = 'cpu'

BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
LR = 1e-4
CAPACITY = 10000

steps_done = 0

In [4]:
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(n_observations, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, observation):
        logits = self.network(observation)
        return logits

We'll implement an epsilon-greedy policy, whith which we select a random action with probability epsilon and the action that maximizes Q otherwise. Here epsilon decays exponentially to a set minimum as the total number of steps increases.

How do we perform optimization in batches where a random action was chosen? If the random action was not what the network predicted would yield most reward, we'll be penalizing our model for predicting the correct action. What should I do then?

Maybe it doesn't matter. Since all we're predicting is the total reward given a certain action, it doesn't matter whether we chose a bad or good action. All that matters is how good the network's prediction was.

Yes, it doesn't matter. We are indeed picking the action with highest expected reward. Nevertheless, we're training the model to predict expected reward given an action, so it doesn't matter which one we picked.

Also, we input only one action at a time, not batches of actions. Batches come into play when we're computing the loss and optimizing the model.

In [5]:
def select_action(model, observation, train=True):
    global steps_done
    # gym's actions are always ndarrays
    tensor_observation = torch.from_numpy(observation)

    # Compute epsilon
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-EPS_DECAY * steps_done)

    with torch.no_grad():
        logits = model(tensor_observation)
        if sample > eps_threshold:
            # Pick action with biggest predicted reward
            action = logits.argmax().item()
        else:
            # Pick action at random
            action = env.action_space.sample()
    if train:
        steps_done += 1
    return action

We store the most recent state transitions up to CAPACITY which we'll randomly sample from to optimize our model. 

In [27]:
Transition = namedtuple('Transition', ['observation', 'action', 'reward', 'next_observation'])

class ReplayMemory():
    def __init__(self, capacity):
        self.replay_memory = deque(maxlen=capacity)

    def __len__(self):
        return len(self.replay_memory)

    def store(self, observation, action, reward, next_observation):
        self.replay_memory.append(Transition(observation, action, reward, next_observation))

    def sample(self, batch_size):
        sample_size = min(self.__len__(), batch_size)
        return random.sample(self.replay_memory, sample_size)

The visualization will continuously update the graph of the series of episode durations.

In [7]:
def update_plot(data, title, xlabel, ylabel, grid=True, sleep=0.5):
    clear_output(wait=True)
    plt.plot(data)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(grid)
    plt.show()
    time.sleep(sleep)

Function to test the model. It'll continuously update a plot with the duration of each episode.

In [22]:
def test_model(model, n_episodes):
    durations = []
    for _ in range(n_episodes):
        episode_duration = 0
        observation, _ = env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            action, _ = select_action(model, observation, train=False)
            observation, _, terminated, truncated, _ = env.step(action)
            episode_duration += 1
        durations.append(episode_duration)
        update_plot(durations, 'Episode durations', 'Episode', 'Duration', grid=False)

In [23]:
memory = ReplayMemory(CAPACITY)
model = DQN(n_observations, n_actions).to(device)
loss_fn = nn.SmoothL1Loss()
optimizer = optim.AdamW(model.parameters(), lr=LR, amsgrad=True)

Everything below is a mess lol... I need to find a way to mask the final observations (not too hard, create tensor with 1s and 0s and multiply with Q) and figure out other things but it shouldn't take too long.

In [10]:
def train_model(model, n_episodes, loss_fn, optimizer):
    for _ in range(n_episodes):
        # Initialize env
        observation, _ = env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            # Select and execute action
            action = select_action(model, observation)
            next_observation, reward, terminated, truncated, _ = env.step(action)

            # CartPole always gives reward = 1 even when episode terminates
            # This fixes the problem
            if terminated or truncated:
                reward = 0.0

            # Store transition in memory
            memory.store(observation, action, reward, next_observation)

            # Sample and modify batch of transitions
            transitions = memory.sample(BATCH_SIZE)

            observation_batch = torch.from_numpy(np.vstack([t.observation for t in transitions]))
            action_batch = torch.from_numpy(np.vstack([t.action for t in transitions]))
            reward_batch = torch.from_numpy(np.vstack([t.reward for t in transitions]))
            next_observation_batch = torch.from_numpy(np.vstack([t.next_observation for t in transitions]))

            pred = model(observation_batch)
            next_pred = model()
            target = reward_batch + GAMMA

In [29]:
memory = ReplayMemory(CAPACITY)

observation, _ = env.reset()
terminated, truncated = False, False
while not (terminated or truncated):
    action = select_action(model, observation, train=False)
    next_observation, reward, terminated, truncated, _ = env.step(action)
    # Env gives reward = 1 even when episode terminates, this fixes the bug
    if terminated or truncated:
        reward = 0.0
    memory.store(observation, action, reward, next_observation)

In [30]:
memory.sample(3)

[Transition(observation=array([ 0.02976117, -0.03038454, -0.01162784, -0.0406074 ], dtype=float32), action=1, reward=1.0, next_observation=array([ 0.11333289,  0.9532838 , -0.16673163, -1.7010267 ], dtype=float32)),
 Transition(observation=array([ 0.02976117, -0.03038454, -0.01162784, -0.0406074 ], dtype=float32), action=0, reward=1.0, next_observation=array([ 0.05018069,  0.16634908, -0.05177746, -0.3692666 ], dtype=float32)),
 Transition(observation=array([ 0.02976117, -0.03038454, -0.01162784, -0.0406074 ], dtype=float32), action=0, reward=1.0, next_observation=array([ 0.08699369,  0.56024   , -0.11855218, -1.040774  ], dtype=float32))]

In [31]:
transitions = memory.sample(10)
batch_observations = torch.from_numpy(np.vstack([t.observation for t in transitions]))

In [32]:
batch_observations

tensor([[ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406],
        [ 0.0298, -0.0304, -0.0116, -0.0406]])