# This file is for creating the benchmark, with stacked frames.

# import relevant packages

In [1]:
import wandb

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

from wandb.integration.sb3 import WandbCallback

from feature_extraction.callbacks.wandb_eval_callback import WandbEvalCallback
from feature_extraction.callbacks.wandb_on_training_end_callback import WandbOnTrainingEndCallback
from utils import evaluate_policy, linear_schedule
from collections import OrderedDict


2024-03-21 17:07:23.722639: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 17:07:23.722760: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 17:07:23.725488: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-21 17:07:23.740475: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Settings

In [2]:
progress_bar = True
train_model = True
eval_model = False
save_name = "breakout_benchmark"
log_dir = "logs"


# Setup Wandb

In [3]:
wandb.login()

config = OrderedDict([
    # Environment settings
    ('env_id', "ALE/Breakout-v5"),
    ('n_envs', 8),
    ('env_wrapper', ['stable_baselines3.common.atari_wrappers.AtariWrapper']),
    ('frame_stack', 4),
    ('training_seed', 12),
    ('evaluation_seed', 14),
    
    # Algorithm and policy
    ('algo', 'PPO'),
    ('policy', 'CnnPolicy'),
    
    # Training hyperparameters
    ('batch_size', 256),
    ('n_steps', 128),
    ('n_epochs', 4),
    ('n_timesteps', 2_000   ),
    ('learning_rate', 0.00025),
    ('learning_rate_schedule', 'linear'),
    ('clip_range', 0.1),
    ('clip_range_schedule', 'linear'),
    ('ent_coef', 0.01),
    ('vf_coef', 0.5),
    ('normalize_advantage', False),
    
    # Evaluation and logging
    ('n_eval_episodes', 5),
    ('record_n_episodes', 10),
    ('log_frequency', 200),
    
    # Other settings
    ('verbose', 1)
])

wandb.init(
    project=save_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
)


config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33madicreson[0m ([33mfeature_extraction[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Create Evaluation Environment

In [4]:
vec_eval_env = make_atari_env(config.env_id, n_envs=config.n_envs, seed=config.evaluation_seed)
vec_eval_env = VecFrameStack(vec_eval_env, n_stack=config.frame_stack)
vec_eval_env = VecTransposeImage(vec_eval_env)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


# Create Training Environment

In [5]:
vec_train_env = make_atari_env(config.env_id, n_envs=config.n_envs, seed=config.training_seed)
vec_train_env = VecFrameStack(vec_train_env, n_stack=config.frame_stack)
vec_train_env = VecTransposeImage(vec_train_env)

# Create Model

In [6]:
# Define the keys for PPO-specific hyperparameters
ppo_params_keys = [
    'batch_size',
    'ent_coef',
    'n_epochs',
    'n_steps',
    'policy',
    'vf_coef',
    'normalize_advantage',
]

# Filter the config dictionary to extract only the PPO hyperparameters
ppo_hyperparams = {key: config[key] for key in ppo_params_keys if key in config}

# Additional hyperparameters not in the initial filter that require custom handling
learning_rate_schedule = linear_schedule(2.5e-4)
clip_range_schedule = linear_schedule(0.1)

# Instantiate the PPO model with the specified hyperparameters and environment
model = PPO(
    **ppo_hyperparams,
    learning_rate=learning_rate_schedule,
    clip_range=clip_range_schedule,
    env=vec_train_env, 
    verbose=1,
    tensorboard_log=f"{log_dir}/tensorboard",
)


Using cpu device


# Create Callbacks

In [7]:
# Callback for logging evaluation metrics
wandb_eval_callback = WandbEvalCallback()
# Save best model
eval_callback = EvalCallback(
    eval_env=vec_eval_env,
    eval_freq=max(config.log_frequency // config.n_envs, 1),
    n_eval_episodes=config.n_eval_episodes,
    best_model_save_path=log_dir,
    log_path=log_dir,
    callback_after_eval=wandb_eval_callback,
    deterministic=True,
    render=False,
    verbose=0
)


# Needs to be changed, so it uses run instead of wandb
wandb_callback = WandbCallback(
    gradient_save_freq=config.log_frequency,
    verbose=1,
)

wandb_on_training_end_callback = WandbOnTrainingEndCallback(
    model=model,
    eval_env=vec_eval_env,
    log_dir=log_dir,
    n_eval_episodes=config.n_eval_episodes,
    record_n_episodes=config.record_n_episodes,
)

callbacks = [wandb_callback]

# Train Model with callbacks

In [8]:
model.learn(
    total_timesteps=config.n_timesteps,
    callback=callbacks,
)   

Logging to logs/tensorboard/PPO_4


2024-03-21 17:07:32.448585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 17:07:32.449058: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 17:07:32.450495: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-21 17:07:32.461369: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 232      |
|    ep_rew_mean     | 1.7      |
| time/              |          |
|    fps             | 207      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 225           |
|    ep_rew_mean          | 1.59          |
| time/                   |               |
|    fps                  | 189           |
|    iterations           | 2             |
|    time_elapsed         | 10            |
|    total_timesteps      | 2048          |
| train/                  |               |
|    approx_kl            | 0.00041484192 |
|    clip_fraction        | 0.0986        |
|    clip_range           | 0.0488        |
|    entropy_loss         | -1.39         |
|    explained_variance   | 0.0136        |


<stable_baselines3.ppo.ppo.PPO at 0x7b77b5c13c70>

# Cleanup

In [9]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁█
rollout/ep_len_mean,█▁
rollout/ep_rew_mean,█▁
time/fps,█▁
train/approx_kl,▁
train/clip_fraction,▁
train/clip_range,▁
train/entropy_loss,▁
train/explained_variance,▁
train/learning_rate,▁

0,1
global_step,2048.0
rollout/ep_len_mean,225.22449
rollout/ep_rew_mean,1.59184
time/fps,189.0
train/approx_kl,0.00041
train/clip_fraction,0.09863
train/clip_range,0.0488
train/entropy_loss,-1.38598
train/explained_variance,0.01358
train/learning_rate,0.00012
