# This file is for creating the benchmark, with stacked frames.

# install relevant packages

In [1]:
#!pip install 'gymnasium[atari]'
#!pip install 'gymnasium[accept-rom-license]'
#!pip install 'opencv-python'

# import relevant packages

In [2]:
import gymnasium as gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
from stable_baselines3.common.callbacks import EvalCallback
from utils import evaluate_policy
import wandb

2024-03-13 17:01:40.281772: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 17:01:40.281830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 17:01:40.283129: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-13 17:01:40.289443: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Settings

In [3]:
wandb.login()
# cceb2653e8e4543a510e4c872213e68ea45cb706


[34m[1mwandb[0m: Currently logged in as: [33madicreson[0m ([33mfeature_extraction[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:

progress_bar = False
train_model = True
eval_model = True
save_name = "a2c_breakout_benchmark_framestack"


config = dict(
    env_id="ALE/Breakout-v5",
    algorithm='PPO',
    #Hyperparams
    policy="CnnPolicy",
    learning_rate='lin_2.5e-4',
    n_steps=128,
    batch_size=256,
    n_epochs=4,
    n_envs=8,
    n_timesteps=10_000_000.0,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range='lin_0.1',
    clip_range_vf=None,
    normalize_advantage=True,
    normalize=False,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    rollout_buffer_class=None,
    rollout_buffer_kwargs=None,
    target_kl=None,
    stats_window_size=100,
    tensorboard_log=None,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device='auto',
    _init_setup_model=True,
    env_wrapper='stable_baselines3.common.atari_wrappers.AtariWrapper',
    frame_stack=4,
)

wandb.init(project=save_name, config=config)
config = wandb.config

# Create callbacks

In [4]:
# Every n steps.
# checkpoint_on_event = CheckpointCallback(save_freq=1, save_path="./logs/")
# callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
vec_eval_env = make_atari_env(config.env_id, n_envs=config.n_envs)
vec_eval_env = VecFrameStack(vec_eval_env, n_stack=config.frame_stack)
vec_eval_env = VecTransposeImage(vec_eval_env)

# Save best model
callback = EvalCallback(vec_eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=500,
                             deterministic=True, render=False)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


# Create vectorized env and stack frames

In [5]:
vec_train_env = make_atari_env(config.env_id, n_envs=config.n_envs)
# Frame-stacking with 4 frames
vec_train_env = VecFrameStack(vec_train_env, n_stack=config.frame_stack)
vec_train_env = VecTransposeImage(vec_train_env)

# Create model, learn and save

In [6]:
if train_model:
    model = PPO(config.policy, vec_train_env, verbose=1)
    model.learn(total_timesteps=config.n_timesteps, callback=callback, progress_bar=progress_bar)
    model.save(save_name)

Using cpu device
Eval num_timesteps=2000, episode_reward=2.40 +/- 1.50
Episode length: 260.60 +/- 62.68
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 261      |
|    mean_reward        | 2.4      |
| time/                 |          |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.27    |
|    explained_variance | 0.928    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0545   |
|    value_loss         | 0.0326   |
------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 236      |
|    ep_rew_mean     | 1.8      |
| time/              |          |
|    fps             | 164      |
|    iterations      | 100      |
|    time_elapsed    | 12       |
|    total_timesteps | 2000     |
---------------------------------
Eval num_timeste

# Load and evaluate Model

In [7]:
if eval_model:
    model = PPO.load("logs/best_model.zip", env=vec_eval_env)
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=2, render=False, fps=30)
    print(mean_reward, std_reward)
    