# This file is for creating the benchmark, with stacked frames.

# install relevant packages

In [1]:
#!pip install 'gymnasium[atari]'
#!pip install 'gymnasium[accept-rom-license]'
#!pip install 'opencv-python'

# import relevant packages

In [2]:
import gymnasium as gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
from stable_baselines3.common.callbacks import EvalCallback
from feature_extraction.callbacks.wandb_reward_logging_callback import WandbRewardLoggingCallback
from utils import evaluate_policy
import wandb

2024-03-14 20:10:22.055105: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 20:10:22.055145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 20:10:22.056413: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-14 20:10:22.062893: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Settings

In [3]:
progress_bar = False
train_model = True
eval_model = True
save_name = "a2c_breakout_benchmark_framestack"

# Login to wanb and create a project with config

In [4]:
wandb.login()

config = dict(
    env_id="ALE/Breakout-v5",
    algorithm='PPO',
    #Hyperparams
    policy="CnnPolicy",
    learning_rate=2.5e-4,
    n_steps=128,
    batch_size=256,
    n_epochs=4,
    n_envs=8,
    n_timesteps=10_000,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.1,
    clip_range_vf=None,
    normalize_advantage=True,
    normalize=False,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    rollout_buffer_class=None,
    rollout_buffer_kwargs=None,
    target_kl=None,
    stats_window_size=100,
    tensorboard_log=None,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device='auto',
    _init_setup_model=True,
    env_wrapper='stable_baselines3.common.atari_wrappers.AtariWrapper',
    frame_stack=4,
)

wandb.init(project=save_name, config=config)
config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33madicreson[0m ([33mfeature_extraction[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Create callbacks

In [5]:
vec_eval_env = make_atari_env(config.env_id, n_envs=config.n_envs)
vec_eval_env = VecFrameStack(vec_eval_env, n_stack=config.frame_stack)
vec_eval_env = VecTransposeImage(vec_eval_env)

# WandbCallback
wandb_callback_after_eval = WandbRewardLoggingCallback()

# Save best model
eval_callback = EvalCallback(vec_eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=max(500 // config.n_envs, 1), callback_after_eval=wandb_callback_after_eval,
                             deterministic=True, render=False)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


# Create vectorized env and stack frames

In [6]:
vec_train_env = make_atari_env(config.env_id, n_envs=config.n_envs)
# Frame-stacking with 4 frames
vec_train_env = VecFrameStack(vec_train_env, n_stack=config.frame_stack)
vec_train_env = VecTransposeImage(vec_train_env)

# Create model, learn and save with wandb

In [7]:
if train_model:
    ppo_params_keys = [
        'policy', 'learning_rate', 'n_steps', 'batch_size', 'n_epochs',
        'gamma', 'gae_lambda', 'clip_range', 'clip_range_vf', 'normalize_advantage',
        'ent_coef', 'vf_coef', 'max_grad_norm', 'use_sde', 'sde_sample_freq',
        'rollout_buffer_class', 'rollout_buffer_kwargs', 'target_kl',
        'stats_window_size', 'tensorboard_log', 'policy_kwargs', 'verbose',
        'seed', 'device', '_init_setup_model'
    ]   
    
    # Step 2: Filter the config dictionary to extract only the hyperparameters for PPO
    ppo_hyperparams = {key: config[key] for key in ppo_params_keys if key in config}
    
    # Step 3: Unpack the filtered hyperparameters dictionary into the PPO constructor
    model = PPO(**ppo_hyperparams, env=vec_train_env)
    
    model = PPO(config.policy, vec_train_env, verbose=1)
    model.learn(total_timesteps=config.n_timesteps, callback=eval_callback, progress_bar=progress_bar)
    model.save(save_name)

Using cpu device
Eval num_timesteps=496, episode_reward=2.20 +/- 0.40
Episode length: 251.60 +/- 20.91
---------------------------------
| eval/              |          |
|    mean_ep_length  | 252      |
|    mean_reward     | 2.2      |
| time/              |          |
|    total_timesteps | 496      |
---------------------------------
New best mean reward!
Eval num_timesteps=992, episode_reward=2.20 +/- 0.40
Episode length: 251.20 +/- 23.42
---------------------------------
| eval/              |          |
|    mean_ep_length  | 251      |
|    mean_reward     | 2.2      |
| time/              |          |
|    total_timesteps | 992      |
---------------------------------
Eval num_timesteps=1488, episode_reward=1.20 +/- 0.98
Episode length: 217.00 +/- 34.29
---------------------------------
| eval/              |          |
|    mean_ep_length  | 217      |
|    mean_reward     | 1.2      |
| time/              |          |
|    total_timesteps | 1488     |
----------------------

# Load and evaluate Model

In [8]:
if eval_model:
    model = PPO.load("logs/best_model.zip", env=vec_eval_env)
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=2, render=False, fps=30)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
    

Running evaluation
1.5 1.5


# Wrap up

In [9]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/mean_reward,▆▆▂▅▃▆▄█▅▆▆▂▆▄▄▄▃▅▅▃▅▆▆▃▃▇▃▄▇▄▄▄▁

0,1
eval/mean_reward,1.0
