QR DQN

Logan Wong

law3082

In [1]:
# REMINDER: make sure you set
# Runtime
# Change runtime type
# T4 GPU

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari
!ls -la

/content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari
total 29
drwx------ 2 root root 4096 Sep 25 15:17 checkpoints
drwx------ 2 root root 4096 Sep 25 15:17 code
drwx------ 2 root root 4096 Sep 25 15:04 .git
-rw------- 1 root root 6087 Sep 30 00:04 github_terminal.ipynb
-rw------- 1 root root   33 Sep 26 19:25 .gitignore
drwx------ 2 root root 4096 Sep 25 15:17 models
-rw------- 1 root root 2348 Sep 29 02:21 README.md
drwx------ 2 root root 4096 Sep 25 15:17 results


In [13]:
!pip install gymnasium[atari,accept-rom-license] ale-py sb3_contrib stable-baselines3



In [14]:
import os
import torch
import gymnasium as gym
import stable_baselines3
import ale_py
import numpy as np
from sb3_contrib import QRDQN


# Visualization
from PIL import Image
import io
import base64
from IPython.display import display, HTML



# For debugging
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import BaseCallback
import time

# Action masking
from gymnasium import ActionWrapper
from stable_baselines3.common.atari_wrappers import AtariWrapper

# Vector environment
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecEnvWrapper, DummyVecEnv


import gc

print("All imports working")

All imports working


In [15]:
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

GPU available: True
GPU: Tesla T4


In [16]:
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60

    return "%d:%02d:%02d" % (hour, minutes, seconds)

In [17]:
# # Show the first frame as an image
# def show_frame(obs):
#     # Convert to PIL Image and display
#     img = Image.fromarray(obs)
#     display(img)

Create Environment and Model

In [18]:
# Action space: Discrete(6)
# Number of actions: 6
# Action meanings: ['NOOP', 'FIRE', 'UP', 'DOWN', 'UPFIRE', 'DOWNFIRE']
# Observation shape: (210, 160, 3)
# Observation space: Box(0, 255, (210, 160, 3), uint8)

# ACTIONS:
# 0: NOOP
# 1: FIRE
# 2: UP
# 3: DOWN

# 4: UPFIRE
# 5: DOWNFIRE

In [19]:
class ActionReducer(ActionWrapper):
  def __init__(self, env):
    super().__init__(env)

    # NOOP, FIRE, UP, and DOWN only. No UPFIRE. No DOWNFIRE.
    self.allowed_actions = [0,1,2,3]

    self.action_space = gym.spaces.Discrete(len(self.allowed_actions))

  def action(self, action):
    return self.allowed_actions[action]

In [20]:
def make_env():
  # DQN only supports single environments (not vectorized)
  env = gym.make("ALE/Bowling-v5")
  env = ActionReducer(env)
  env = Monitor(env)
  # disable reward clipping
  env = AtariWrapper(env, clip_reward=False)
  return env

In [21]:
seed = 316
torch.manual_seed(seed)

env = DummyVecEnv([make_env])
env = VecFrameStack(env, n_stack=4)

model = QRDQN(
    "CnnPolicy",
    env,
    learning_rate=0.0001,
    buffer_size=50000,
    batch_size=32,
    gamma=0.99,
    target_update_interval=1000,
    train_freq=4,
    gradient_steps=1,
    exploration_final_eps=0.02,
    exploration_fraction=0.1,
    learning_starts=10000,
    verbose=1,
    device="cuda"
)

print("QR-DQN model created")

Using cuda device
Wrapping the env in a VecTransposeImage.
QR-DQN model created


  return datetime.utcnow().replace(tzinfo=utc)


In [22]:
# # Confirm number of possible actions is smaller now
# print(f"Original action space: {gym.make('ALE/Bowling-v5').action_space}")
# print(f"Reduced action space: {env.action_space}")

Train the model

In [23]:
class SimpleCheckpointCallback(BaseCallback):
    def __init__(self, save_freq, save_path, name_prefix):
        super().__init__()
        self.save_freq = save_freq
        self.save_path = save_path
        self.name_prefix = name_prefix

    def _on_step(self):
        if self.n_calls % self.save_freq == 0:
            path = os.path.join(self.save_path, f"{self.name_prefix}_{self.n_calls}")
            self.model.save(path)
            print(f"Checkpoint saved at step {self.n_calls}")
        return True

In [None]:
total_timesteps = 10000000    # 10M
# total_timesteps = 5000000    # 5M
# total_timesteps =  2000000    # 2M
# total_timesteps =  1000000    # 1M
# total_timesteps =   100000    # 100K
# total_timesteps =    10000    # 10K
# total_timesteps =     5000    # 5K

# Save couple of steps
checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="/content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari/checkpoints",
    name_prefix=f"qr_dqn_{total_timesteps}"
)

# Time how long it takes
print("Training started")
start_time = time.time()

# log_interval: Print train metrics every 10 episodes
model.learn(
    total_timesteps=total_timesteps,
    callback=checkpoint_callback,
    progress_bar=True,
    log_interval=10
)
end_time = time.time()
print("Training done")

env.close()

# Calculate run time
training_duration = end_time - start_time
time_in_minutes_and_seconds = convert(training_duration)
print(f"Time taken: {time_in_minutes_and_seconds}")
# print(f"Speed: {total_timesteps/training_duration:.2f} steps/second")

# Save model to Google Drive
trained_model_save_path = f"/content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari/models/qr_dqn{total_timesteps}"
model.save(trained_model_save_path)
print("Model saved to Google Drive")

Output()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 46.1     |
|    n_updates        | 1473290  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 105      |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 9310     |
|    fps              | 184      |
|    time_elapsed     | 32036    |
|    total_timesteps  | 5908472  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 49.5     |
|    n_updates        | 1474617  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 105      |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 9

In [None]:
# torch.cuda.empty_cache()
# del model
# del env