Evaluate QR DQN

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari
!ls -la

/content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari
total 30
drwx------ 2 root root 4096 Sep 25 15:17 checkpoints
drwx------ 2 root root 4096 Sep 25 15:17 code
drwx------ 2 root root 4096 Sep 25 15:04 .git
-rw------- 1 root root 6200 Oct 13 19:29 github_terminal.ipynb
-rw------- 1 root root   33 Sep 26 19:25 .gitignore
drwx------ 2 root root 4096 Sep 25 15:17 models
-rw------- 1 root root 2348 Sep 29 02:21 README.md
drwx------ 2 root root 4096 Sep 25 15:17 results


In [6]:
!pip install gymnasium[atari,accept-rom-license] ale-py sb3_contrib stable-baselines3

Collecting sb3_contrib
  Downloading sb3_contrib-2.7.0-py3-none-any.whl.metadata (4.1 kB)
Downloading sb3_contrib-2.7.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sb3_contrib
Successfully installed sb3_contrib-2.7.0


In [7]:
import os
import torch
import gymnasium as gym
import stable_baselines3
import ale_py
import numpy as np
from sb3_contrib import QRDQN


# Visualization
from PIL import Image
import io
import base64
from IPython.display import display, HTML



# For debugging
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import BaseCallback
import time

# Action masking
from gymnasium import ActionWrapper
from stable_baselines3.common.atari_wrappers import AtariWrapper

# Vector environment
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecEnvWrapper, DummyVecEnv


import gc

print("All imports working")

All imports working


In [8]:
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

GPU available: True
GPU: Tesla T4


In [9]:
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60

    return "%d:%02d:%02d" % (hour, minutes, seconds)

Create Environment

In [10]:
class ActionReducer(ActionWrapper):
  def __init__(self, env):
    super().__init__(env)

    # NOOP, FIRE, UP, and DOWN only. No UPFIRE. No DOWNFIRE.
    self.allowed_actions = [0,1,2,3]

    self.action_space = gym.spaces.Discrete(len(self.allowed_actions))

  def action(self, action):
    return self.allowed_actions[action]

In [11]:
def make_env():
  # DQN only supports single environments (not vectorized)
  env = gym.make("ALE/Bowling-v5")
  env = ActionReducer(env)
  env = Monitor(env)
  # disable reward clipping
  env = AtariWrapper(env, clip_reward=False)
  return env

In [12]:
seed = 316
torch.manual_seed(seed)

env = DummyVecEnv([make_env])
env = VecFrameStack(env, n_stack=4)


Load Model

In [14]:
# Current working directory:
# /content/drive/MyDrive/MECE689_Bowling/MECE689_RL_Bowling_Atari

# Load model
model_name = "qr_dqn_10000000"

model = QRDQN.load(
    f"models/{model_name}",
    env=env,
    device="cuda"
)

print("Model loaded")

Wrapping the env in a VecTransposeImage.
Model loaded


  return datetime.utcnow().replace(tzinfo=utc)


Evaluate Model

In [15]:
all_rewards = []
all_lengths = []
# total_episodes = 10
# total_episodes = 100
total_episodes = 10000   # 10K

print("Running evaluation of trained QR DQN agent")
for episode in range(total_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    steps = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)

        total_reward += reward[0]
        steps += 1

    # Record
    all_rewards.append(total_reward)
    all_lengths.append(steps)

    # print(type(total_reward))
    # print(total_reward)
    print(f"Episode {episode+1}: Reward = {total_reward:6.1f}, Steps = {steps}")

# NOTE: this is faster than training bc no back prop is done, so no gradients need to be calculated

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode 5001: Reward =   89.0, Steps = 537
Episode 5002: Reward =   88.0, Steps = 535
Episode 5003: Reward =   88.0, Steps = 548
Episode 5004: Reward =   88.0, Steps = 548
Episode 5005: Reward =   87.0, Steps = 570
Episode 5006: Reward =   88.0, Steps = 535
Episode 5007: Reward =   88.0, Steps = 536
Episode 5008: Reward =   89.0, Steps = 537
Episode 5009: Reward =   87.0, Steps = 567
Episode 5010: Reward =   82.0, Steps = 565
Episode 5011: Reward =   87.0, Steps = 566
Episode 5012: Reward =   85.0, Steps = 549
Episode 5013: Reward =   89.0, Steps = 555
Episode 5014: Reward =   83.0, Steps = 548
Episode 5015: Reward =   85.0, Steps = 547
Episode 5016: Reward =   87.0, Steps = 567
Episode 5017: Reward =   87.0, Steps = 565
Episode 5018: Reward =   88.0, Steps = 550
Episode 5019: Reward =   88.0, Steps = 548
Episode 5020: Reward =   86.0, Steps = 548
Episode 5021: Reward =   89.0, Steps = 553
Episode 5022: Reward =   87.0, S

In [16]:
env.close()

In [17]:
seed = 316
torch.manual_seed(seed)

env = DummyVecEnv([make_env])
env = VecFrameStack(env, n_stack=4)

In [18]:
all_random_rewards = []
all_random_lengths = []

print("Running evaluation of random agent")
for episode in range(total_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    steps = 0

    while not done:
      # agent picks a random action
      action = env.action_space.sample()
      # The action needs to be a list for the vectorized environment
      obs, reward, done, info = env.step([action])

      total_reward += reward[0]
      steps += 1

    all_random_rewards.append(total_reward)
    all_random_lengths.append(steps)

    print(f"Episode {episode+1}: Reward = {total_reward:6.1f}, Steps = {steps}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode 5001: Reward =   60.0, Steps = 535
Episode 5002: Reward =   29.0, Steps = 564
Episode 5003: Reward =   36.0, Steps = 569
Episode 5004: Reward =   31.0, Steps = 547
Episode 5005: Reward =   36.0, Steps = 578
Episode 5006: Reward =   13.0, Steps = 581
Episode 5007: Reward =   24.0, Steps = 581
Episode 5008: Reward =   34.0, Steps = 564
Episode 5009: Reward =   41.0, Steps = 567
Episode 5010: Reward =   18.0, Steps = 541
Episode 5011: Reward =    9.0, Steps = 544
Episode 5012: Reward =   25.0, Steps = 543
Episode 5013: Reward =   26.0, Steps = 524
Episode 5014: Reward =   31.0, Steps = 562
Episode 5015: Reward =   30.0, Steps = 595
Episode 5016: Reward =   18.0, Steps = 559
Episode 5017: Reward =   34.0, Steps = 565
Episode 5018: Reward =   26.0, Steps = 525
Episode 5019: Reward =   32.0, Steps = 584
Episode 5020: Reward =   28.0, Steps = 562
Episode 5021: Reward =   47.0, Steps = 562
Episode 5022: Reward =   22.0, S

In [19]:
env.close()

Evaluate Performance

In [20]:
# Calculate metrics
rewards_array = np.array(all_rewards)
lengths_array = np.array(all_lengths)

print("Evaluation metrics of TRAINED DQN:")
print(f"Total episodes: {total_episodes}")
print(f"Mean reward: {np.mean(rewards_array):.2f}")
print(f"Median reward: {np.median(rewards_array):.2f}")
print(f"Min reward: {np.min(rewards_array):.2f}")
print(f"Max reward: {np.max(rewards_array):.2f}")
print(f"Standard deviation: {np.std(rewards_array):.2f}")
print(f"Average episode length: {np.mean(lengths_array):.1f} steps")

Evaluation metrics of TRAINED DQN:
Total episodes: 10000
Mean reward: 87.24
Median reward: 88.00
Min reward: 77.00
Max reward: 95.00
Standard deviation: 1.69
Average episode length: 551.1 steps


In [21]:
# Calculate metrics
random_rewards_array = np.array(all_random_rewards)
random_lengths_array = np.array(all_random_lengths)

print("Evaluation metrics of RANDOM AGENT:")
print(f"Total episodes: {total_episodes}")
print(f"Mean reward: {np.mean(random_rewards_array):.2f}")
print(f"Median reward: {np.median(random_rewards_array):.2f}")
print(f"Min reward: {np.min(random_rewards_array):.2f}")
print(f"Max reward: {np.max(random_rewards_array):.2f}")
print(f"Standard deviation: {np.std(random_rewards_array):.2f}")
print(f"Average episode length: {np.mean(random_lengths_array):.1f} steps")

Evaluation metrics of RANDOM AGENT:
Total episodes: 10000
Mean reward: 27.90
Median reward: 27.00
Min reward: 3.00
Max reward: 93.00
Standard deviation: 9.41
Average episode length: 568.0 steps


In [22]:
def calculate_hns(agent_score, random_score, human_score):
  # Calculate Human Normalized Score (HNS)
  num = agent_score - random_score
  denom = human_score - random_score
  hns = num / denom
  return hns


def calculate_hwrns(agent_score, random_score, world_record_score):
  # Calculate Human World Record Normalized Score (HWRNS)
  num = agent_score - random_score
  denom = world_record_score - random_score
  hwrns = num / denom
  return hwrns

In [23]:
all_hns_values = []
all_hwrns_values = []

# HARD CODED VALUES:
# average human baseline
human_score = 161.0
# Max score is 300 via Perfect Game aka 12 strikes in a row
world_record_score = 300

for i in range(total_episodes):
  agent_score = all_rewards[i]
  random_score = all_random_rewards[i]

  hns = calculate_hns(agent_score, random_score, human_score)
  print(f"Episode {i+1}, HNS:   {(hns):.4f}")
  all_hns_values.append(hns)

  hwrns = calculate_hwrns(agent_score, random_score, world_record_score)
  print(f"Episode {i+1}, HWRNS: {(hwrns):.4f}\n")
  all_hwrns_values.append(hwrns)

Episode 1, HNS:   0.4419
Episode 1, HWRNS: 0.2127

Episode 2, HNS:   0.3455
Episode 2, HWRNS: 0.1526

Episode 3, HNS:   0.4122
Episode 3, HWRNS: 0.2000

Episode 4, HNS:   0.4062
Episode 4, HWRNS: 0.1948

Episode 5, HNS:   0.4427
Episode 5, HWRNS: 0.2148

Episode 6, HNS:   0.4825
Episode 6, HWRNS: 0.2447

Episode 7, HNS:   0.4504
Episode 7, HWRNS: 0.2185

Episode 8, HNS:   0.3871
Episode 8, HWRNS: 0.1825

Episode 9, HNS:   0.4194
Episode 9, HWRNS: 0.1977

Episode 10, HNS:   0.3036
Episode 10, HWRNS: 0.1355

Episode 11, HNS:   0.4127
Episode 11, HWRNS: 0.1962

Episode 12, HNS:   0.4748
Episode 12, HWRNS: 0.2374

Episode 13, HNS:   0.4714
Episode 13, HWRNS: 0.2366

Episode 14, HNS:   0.3393
Episode 14, HWRNS: 0.1514

Episode 15, HNS:   0.4264
Episode 15, HWRNS: 0.2052

Episode 16, HNS:   0.4857
Episode 16, HWRNS: 0.2437

Episode 17, HNS:   0.5034
Episode 17, HWRNS: 0.2604

Episode 18, HNS:   0.4899
Episode 18, HWRNS: 0.2535

Episode 19, HNS:   0.4297
Episode 19, HWRNS: 0.2060

Episode 20,

In [24]:
all_hns_vals_array = np.array(all_hns_values)

print("Evaluating HNS values:")
print(f"Mean HNS: {np.mean(all_hns_vals_array):.4f}")
print(f"Median HNS: {np.median(all_hns_vals_array):.4f}")
print(f"Min HNS: {np.min(all_hns_vals_array):.4f}")
print(f"Max HNS: {np.max(all_hns_vals_array):.4f}")
print(f"Standard deviation: {np.std(all_hns_vals_array):.4f}")

Evaluating HNS values:
Mean HNS: 0.4429
Median HNS: 0.4478
Min HNS: -0.0588
Max HNS: 0.5443
Standard deviation: 0.0435


In [25]:
all_hwrns_vals_array = np.array(all_hwrns_values)

print("Evaluating HWRNS values:")
print(f"Mean HWRNS: {np.mean(all_hwrns_vals_array):.4f}")
print(f"Median HWRNS: {np.median(all_hwrns_vals_array):.4f}")
print(f"Min HWRNS: {np.min(all_hwrns_vals_array):.4f}")
print(f"Max HWRNS: {np.max(all_hwrns_vals_array):.4f}")
print(f"Standard deviation: {np.std(all_hwrns_vals_array):.4f}")

Evaluating HWRNS values:
Mean HWRNS: 0.2171
Median HWRNS: 0.2198
Min HWRNS: -0.0193
Max HWRNS: 0.2896
Standard deviation: 0.0283
