In [30]:
%pip install --upgrade pip setuptools ale-py
%pip install stable-baselines3
%pip install "gymnasium[accept-rom-license, atari]"

^C
Note: you may need to restart the kernel to use updated packages.
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3)
  Using cached gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Using cached gymnasium-1.0.0-py3-none-any.whl (958 kB)
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.1.1
    Uninstalling gymnasium-1.1.1:
      Successfully uninstalled gymnasium-1.1.1
Successfully installed gymnasium-1.0.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.






In [1]:
import gymnasium as gym
import ale_py
gym.register_envs(ale_py)


# import gym
# from gym.wrappers import StepAPICompatibility

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from collections import deque
import random
import matplotlib.pyplot as plt

from collections import deque
import random
import pickle
from tqdm import tqdm

In [6]:
use_gpu = True  # Set to False to force CPU usage

# Check if GPU is available
if torch.cuda.is_available() and use_gpu:
    device = torch.device("cuda:0")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    if use_gpu:
        print("GPU requested but not available. Using CPU instead.")
    else:
        print("Using CPU as requested.")

Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU


## Environment Setup

In [7]:
acrobot_env_name = 'Acrobot-v1'
acrobot_env = gym.make(acrobot_env_name)
# acrobot_env = StepAPICompatibility(acrobot_env)
print("Action space:", acrobot_env.action_space)
print("State space:", acrobot_env.observation_space)

Action space: Discrete(3)
State space: Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)


In [8]:
assault_env_name = "ALE/Assault-ram-v5"
assault_env = gym.make(assault_env_name)
# assault_env = StepAPICompatibility(assault_env)
print("Action space:", assault_env.action_space)
print("State space:", assault_env.observation_space)

Action space: Discrete(7)
State space: Box(0, 255, (128,), uint8)


## Q1 Expected SARSA

In [12]:
def softmax(x, temp):
    z = np.exp(x / temp - np.max(x / temp))
    return z / np.sum(z)


class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, device):
        super(QNetwork, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
        )

        self.mlp.apply(self.init_weights)

        self.device = device
        self.to(device)

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.uniform_(m.weight, -0.001, 0.001)
            nn.init.uniform_(m.bias, -0.001, 0.001)

    def forward(self, x):
        return self.mlp(x)


class DeepValueLearning:
    def __init__(self, env, step_size, epsilon, algorithm, gamma=0.99):
        self.env = env
        self.step_size = step_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.algorithm = algorithm
        self.n_actions = env.action_space.n
        self.state_dim = env.observation_space.shape[0]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # self.device = "cpu"
        self.Q = QNetwork(self.state_dim, self.n_actions, self.device)
        self.optimizer = optim.SGD(self.Q.parameters(), lr=self.step_size)
        self.loss_fn = nn.MSELoss()

    def select_action(self, s):
        if np.random.uniform() < self.epsilon:
            return np.random.choice(self.n_actions)
        else:
            state_input = torch.tensor(s).float().unsqueeze(0).to(self.device)
            return torch.argmax(self.Q(state_input)).cpu().item()

    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch, done_batch
    ):
        state_batch = torch.tensor(state_batch).float().to(self.device)
        action_batch = torch.tensor(action_batch).long().to(self.device)
        reward_batch = torch.tensor(reward_batch).float().to(self.device)
        next_state_batch = torch.tensor(next_state_batch).float().to(self.device)
        done_batch = torch.tensor(done_batch).bool().to(self.device)

        q_val_batch = (
            self.Q(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)
        )

        with torch.no_grad():
            next_q_val = self.Q(next_state_batch)
            greedy_next_q_val = next_q_val.max(1)[0]
            if self.algorithm == "Q-Learning":
                target_batch = torch.where(
                    done_batch,
                    reward_batch,
                    reward_batch + self.gamma * greedy_next_q_val,
                )
            else:
                random_next_q_val = next_q_val.mean(dim=1)
                exp_next_q_val = (
                    self.epsilon * random_next_q_val
                    + (1 - self.epsilon) * greedy_next_q_val
                )
                target_batch = torch.where(
                    done_batch, reward_batch, reward_batch + self.gamma * exp_next_q_val
                )

        loss = self.loss_fn(q_val_batch, target_batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [10]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

In [13]:
def run_trial(epsilon, step_size, seed, env, algorithm, use_buffer):
    torch.manual_seed(seed)
    np.random.seed(seed)

    agent = DeepValueLearning(env, step_size, epsilon, algorithm)

    if use_buffer:
        replay_buffer = ReplayBuffer(1_000_000)
        replay_minibatch_size = 128

    max_steps_per_episode = 500

    episode_rewards = []
    for _ in tqdm(range(1000)):
        state, _ = env.reset()
        done = False
        total_reward = 0
        n_steps = 0
        while not done:
            action = agent.select_action(state)
            next_state, reward, done, truncated, _ = env.step(action)
            done = done or truncated
            if use_buffer:
                replay_buffer.push((state, action, reward, next_state, done))

            total_reward += reward

            if use_buffer:
                if (
                    n_steps % replay_minibatch_size
                    and len(replay_buffer) > replay_minibatch_size
                ):
                    # if len(replay_buffer) > replay_minibatch_size:
                    transitions = replay_buffer.sample(replay_minibatch_size)
                    (
                        state_batch,
                        action_batch,
                        reward_batch,
                        next_state_batch,
                        done_batch,
                    ) = zip(*transitions)
                    agent.update(
                        state_batch,
                        action_batch,
                        reward_batch,
                        next_state_batch,
                        done_batch,
                    )
            else:
                agent.update([state], [action], [reward], [next_state], [done])

            state = next_state
            n_steps += 1
            if n_steps >= max_steps_per_episode:
                done = True

        episode_rewards.append(total_reward)

    return episode_rewards


epsilons = [0.01, 0.1, 0.5]
step_sizes = [1 / 4, 1 / 8, 1 / 16]
seeds = range(10)
envs = [acrobot_env, assault_env]
algorithms = ["Expected-SARSA", "Q-Learning"]

total_trials = (
    len(envs) * len(seeds) * len(epsilons) * len(step_sizes) * len(algorithms) * 2
)


# load pickle file for results
# check if file exists
try:
    with open("results.pkl", "rb") as f:
        results = pickle.load(f)
        trials_completed = len(results)
except:
    results = {}
    trials_completed = 0

for env in envs:
    env_name = env.env.spec.id
    for epsilon in epsilons:
        for step_size in step_sizes:
            for algorithm in algorithms:
                for use_buffer in [True, False]:
                    for seed in seeds:
                        print(
                            "Starting trial #", trials_completed + 1, "/", total_trials
                        )

                        episode_rewards = run_trial(
                            epsilon, step_size, seed, env, algorithm, use_buffer
                        )
                        results[
                            (env_name, seed, epsilon, step_size, algorithm, use_buffer)
                        ] = episode_rewards
                        with open("results.pkl", "wb") as f:
                            pickle.dump(results, f)
                        trials_completed += 1
                        print(f"Completed {trials_completed}/{total_trials} trials")

Starting trial # 1 / 720


100%|██████████| 1000/1000 [25:34<00:00,  1.53s/it]


Completed 1/720 trials
Starting trial # 2 / 720


100%|██████████| 1000/1000 [21:44<00:00,  1.30s/it]


Completed 2/720 trials
Starting trial # 3 / 720


100%|██████████| 1000/1000 [27:12<00:00,  1.63s/it]


Completed 3/720 trials
Starting trial # 4 / 720


100%|██████████| 1000/1000 [26:35<00:00,  1.60s/it]


Completed 4/720 trials
Starting trial # 5 / 720


100%|██████████| 1000/1000 [25:46<00:00,  1.55s/it]


Completed 5/720 trials
Starting trial # 6 / 720


 10%|█         | 102/1000 [02:30<22:08,  1.48s/it]


KeyboardInterrupt: 