<a href="https://colab.research.google.com/github/mishra39/deep_rl_course_hf/blob/main/unit8/ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create a virtual display 🔽

During the notebook, we'll need to generate a replay video. To do so, with colab, **we need to have a virtual screen to be able to render the environment** (and thus record the frames).

Hence the following cell will install the libraries, then create and run a virtual screen 🖥

In [1]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay
!apt install x11-utils
!pip install pyglet

In [2]:
# Virtual display
# Start a virtual screen so the environment can be rendered (and frames
# recorded) on a machine that has no physical display, such as Colab.
from pyvirtualdisplay import Display

# visible=0 asks for a display that is not shown; size=(1400, 900) is the
# requested screen size.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7cb8f39bf9e0>

In [3]:
# Display video
import glob
import io
import base64
from IPython.display import HTML

def show_video(video_dir="videos"):
    """Embed the most recently recorded MP4 video in the notebook output.

    The search is recursive, so videos written into per-run sub-directories
    (e.g. ``videos/<run_name>/``) are found as well as files placed directly
    in ``video_dir``.

    Args:
        video_dir: Root directory to search for ``.mp4`` files.

    Returns:
        An ``IPython.display.HTML`` object with the video inlined as a base64
        data URI, or ``None`` (after printing a message) when no video exists.
    """
    # Local import: `os` is only imported by a later cell of the notebook.
    import os

    # "**" with recursive=True matches zero or more nested directories.
    mp4list = glob.glob(os.path.join(video_dir, "**", "*.mp4"), recursive=True)
    if not mp4list:
        print(f"No video files found in the '{video_dir}' directory.")
        return None

    # Show the newest recording rather than whichever file glob lists first.
    video = max(mp4list, key=os.path.getmtime)
    # Read-only binary mode is enough; no write access to the file is needed.
    with open(video, "rb") as f:
        encoded = base64.b64encode(f.read()).decode()
    # Create an HTML display object for Colab
    return HTML(data=f'<video width="1000" controls><source src="data:video/mp4;base64,{encoded}" type="video/mp4" /></video>')

### Install dependencies 🔽

The first step is to install the dependencies. We'll install several:
- `gymnasium`
- `panda-gym`: Contains the robotics arm environments.
- `stable-baselines3`: The SB3 deep reinforcement learning library.
- `huggingface_sb3`: Additional code for Stable-baselines3 to load and upload models from the Hugging Face 🤗 Hub.
- `huggingface_hub`: Library allowing anyone to work with the Hub repositories.

⏲ The installation can **take 10 minutes**.

In [4]:
#!pip install stable-baselines3[extra]
!pip install gymnasium



In [5]:
# !pip install huggingface_sb3
# !pip install huggingface_hub
# !pip install panda_gym

## W&B Prerequisites

Install the W&B Python SDK and log in:

In [6]:
!pip install wandb -qU
!pip install -q gym numpy tensorboard
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [7]:
# Log in to your W&B account
import wandb
import random
import math

In [8]:
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmishra39[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Setup

In [9]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gymnasium as gym  # Use gymnasium instead of gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

## Arguments

In [10]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class PPOConfig:
    """Hyperparameters and run settings for the PPO training notebook.

    The first group of fields configures the experiment (naming, seeding,
    device, tracking); the second group configures the algorithm. Two derived
    attributes, ``batch_size`` and ``minibatch_size``, are set after
    initialisation from the other fields.
    """

    # --- Experiment settings -------------------------------------------
    exp_name: str = "ppo_experiment"  # used when building the run name
    gym_id: str = "CartPole-v1"  # environment id passed to gym.make
    learning_rate: float = 2.5e-4  # initial optimizer learning rate
    seed: int = 1  # seed for random / numpy / torch
    total_timesteps: int = 25000  # total environment steps to train for
    torch_deterministic: bool = True  # sets torch.backends.cudnn.deterministic
    cuda: bool = True  # use CUDA when it is available
    track: bool = False  # log the run to Weights & Biases
    wandb_project_name: str = "ppo-implementation-details"
    wandb_entity: Optional[str] = None
    capture_video: bool = False  # record videos of the agent

    # --- Algorithm specific arguments ----------------------------------
    num_envs: int = 4  # number of parallel environments
    num_steps: int = 128  # rollout steps per environment per update
    anneal_lr: bool = True  # linearly decay the learning rate
    gae: bool = True  # use generalized advantage estimation
    gamma: float = 0.99  # discount factor
    gae_lambda: float = 0.95  # lambda for GAE
    num_minibatches: int = 4  # minibatches per epoch
    update_epochs: int = 4  # optimisation epochs per update
    norm_adv: bool = True  # normalise advantages per minibatch
    clip_coef: float = 0.2  # surrogate clipping coefficient
    clip_vloss: bool = True  # use the clipped value loss
    ent_coef: float = 0.01  # entropy bonus coefficient
    vf_coef: float = 0.5  # value loss coefficient
    max_grad_norm: float = 0.5  # gradient-norm clipping threshold
    target_kl: Optional[float] = None  # optional KL threshold for early stop

    def __post_init__(self):
        """Derive the rollout batch size and the minibatch size."""
        samples_per_update = self.num_envs * self.num_steps
        self.batch_size = int(samples_per_update)
        self.minibatch_size = int(self.batch_size // self.num_minibatches)

# Create instance with default values
# args = PPOConfig()

# Or customize specific values
# args = PPOConfig(learning_rate=1e-3, num_envs=8, total_timesteps=50000)

# print(f"Batch size: {args.batch_size}")
# print(f"Minibatch size: {args.minibatch_size}")

## Gym Environment

In [27]:
def make_env(gym_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        gym_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the action and observation spaces.
        idx: Index of this environment; only index 0 records video.
        capture_video: When true, the environment renders ``rgb_array``
            frames and environment 0 is wrapped with ``RecordVideo``.
        run_name: Name of the sub-directory of ``videos/`` used for recordings.

    Returns:
        A callable that creates and returns the environment.
    """
    render_mode = "rgb_array" if capture_video else None
    record_video = capture_video and idx == 0

    def thunk():
        env = gym.wrappers.RecordEpisodeStatistics(
            gym.make(gym_id, render_mode=render_mode)
        )
        if record_video:
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        # Seed the sampling spaces of the (possibly wrapped) environment.
        for space in (env.action_space, env.observation_space):
            space.seed(seed)
        return env

    return thunk

## Layer Initialization

In [12]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
  """Initialise ``layer`` in place and return it.

  The weight gets an orthogonal initialisation scaled by ``std`` and every
  bias entry is set to ``bias_const``.
  """
  init = torch.nn.init
  init.orthogonal_(layer.weight, gain=std)
  init.constant_(layer.bias, val=bias_const)
  return layer

## Main Loop

In [38]:
# Run configuration: W&B tracking and video capture are both switched on.
args = PPOConfig(track=True, capture_video=True)
# Unique run name: environment id, experiment name, seed and a unix timestamp.
run_name = "__".join(
    [args.gym_id, args.exp_name, str(args.seed), str(int(time.time()))]
)

if args.track:
    # sync_tensorboard makes W&B pick up the scalars written through `writer`.
    # monitor_gym stays False to prevent a conflict with RecordVideo.
    wandb.init(
        name=run_name,
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        config=vars(args),
        sync_tensorboard=True,
        monitor_gym=False,
        save_code=True,
    )

# TensorBoard writer; the hyperparameters are stored as a markdown table.
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n"
    + "\n".join(f"|{key}|{value}|" for key, value in vars(args).items()),
)

# TRY NOT TO MODIFY: seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

# Use the GPU only when it was requested and is actually available.
device = torch.device("cuda" if args.cuda and torch.cuda.is_available() else "cpu")

0,1
charts/SPS,‚ñÖ‚ñÖ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñá‚ñá‚ñà‚ñá‚ñÜ‚ñÖ‚ñÑ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá
charts/episodic_length,‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÑ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñà‚ñÅ‚ñÇ‚ñÅ‚ñÉ‚ñÇ‚ñÅ‚ñÜ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÖ‚ñÉ‚ñÇ‚ñÑ‚ñÉ‚ñÉ
charts/episodic_return,‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÑ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÑ‚ñÜ‚ñÇ‚ñà‚ñÅ‚ñÉ‚ñÉ‚ñÅ‚ñÉ‚ñÖ‚ñÉ‚ñÇ‚ñÉ‚ñÑ‚ñÖ‚ñÇ
charts/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà
losses/approx_kl,‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÖ‚ñà‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÉ‚ñÉ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
losses/clipfrac,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
losses/entropy,‚ñà‚ñà‚ñà‚ñà‚ñá‚ñÜ‚ñá‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÅ‚ñÉ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÅ‚ñÇ‚ñÇ‚ñÉ
losses/explained_variance,‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÇ‚ñÉ‚ñÑ‚ñÉ‚ñá‚ñÖ‚ñÉ‚ñà‚ñà‚ñÜ‚ñÖ‚ñÑ‚ñÇ‚ñÜ‚ñÇ‚ñÉ‚ñÜ‚ñÑ‚ñÑ
losses/old_approx_kl,‚ñÑ‚ñÉ‚ñÉ‚ñÅ‚ñÉ‚ñÖ‚ñÜ‚ñÖ‚ñÑ‚ñà‚ñÉ‚ñÜ‚ñÅ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÑ‚ñÖ‚ñÖ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ

0,1
charts/SPS,1564
charts/episodic_length,129
charts/episodic_return,129
charts/learning_rate,1e-05
global_step,24576
losses/approx_kl,0.0
losses/clipfrac,0
losses/entropy,0.61483
losses/explained_variance,0.03128
losses/old_approx_kl,-1e-05




In [39]:
# env setup
envs = gym.vector.SyncVectorEnv(
    [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
)
obs, infos = envs.reset()

  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


## Agent Setup

In [40]:
class Agent(nn.Module):
  """Actor-critic agent with separate MLPs for the policy and the value.

  Critic:
    - Estimates the value function $V(s)$: a scalar prediction of the total
      expected reward the agent will receive starting from state s.
    - Tanh is often preferred in PPO (and standard implementations like
      CleanRL) because it produces smoother gradients, which helps the
      advantage calculation stay stable.
    - The last critic layer is initialised with std=1, a common practice so
      the initial value estimates aren't near zero and the policy gradient
      has a meaningful baseline immediately.
    - In PPO, the critic's job is to reduce variance.

  Actor:
    - Produces one logit per discrete action. Its last layer uses std=0.01 so
      all actions start out with similar probabilities.
  """

  def __init__(self, envs):
    super().__init__()

    # Critic: flattened observation -> 64 -> 64 -> scalar value.
    critic_in = np.array(envs.single_observation_space.shape).prod()
    critic_layers = [
        layer_init(nn.Linear(critic_in, 64)),
        nn.Tanh(),
        layer_init(nn.Linear(64, 64)),
        nn.Tanh(),
        layer_init(nn.Linear(64, 1), std=1),
    ]
    self.critic = nn.Sequential(*critic_layers)

    # Actor: flattened observation -> 64 -> 64 -> one logit per action.
    actor_in = np.array(envs.single_observation_space.shape).prod()
    actor_layers = [
        layer_init(nn.Linear(actor_in, 64)),
        nn.Tanh(),
        layer_init(nn.Linear(64, 64)),
        nn.Tanh(),
        layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01),
    ]
    self.actor = nn.Sequential(*actor_layers)

  def get_value(self, x):
    """Return the critic's value estimate for observations ``x``."""
    return self.critic(x)

  def get_action_and_value(self, x, action=None):
    """Evaluate the policy and the critic on observations ``x``.

    When ``action`` is None an action is sampled from the policy; otherwise
    the given action is evaluated.

    Returns:
      Tuple of (action, log-probability of the action, policy entropy,
      critic value).
    """
    policy = Categorical(logits=self.actor(x))
    chosen = policy.sample() if action is None else action
    return chosen, policy.log_prob(chosen), policy.entropy(), self.get_value(x)

## Training

In [41]:
# Build the agent and its optimizer on the selected device.
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

# ALGO Logic: Storage setup
# Rollout buffers, indexed as [step, env]; observations and actions carry the
# per-environment space shape as trailing dimensions.
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros_like(logprobs).to(device)
dones = torch.zeros_like(logprobs).to(device)
values = torch.zeros_like(logprobs).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
# Seed the environments on this reset. make_env only seeds the action and
# observation spaces, so without a seed here the environment dynamics would
# differ between otherwise identical runs.
next_obs, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(args.num_envs).to(device)
# Number of policy updates; any remainder of total_timesteps is not used.
num_updates = args.total_timesteps // args.batch_size
print(f"total_timesteps: {args.total_timesteps}")
print(f"batch_size: {args.batch_size}")
print(f"num_updates: {num_updates}")

total_timesteps: 25000
batch_size: 512
num_updates: 48


## Training Loop

In [42]:
# PPO training loop. Each update (1) collects args.num_steps transitions from
# every environment, (2) computes advantages and returns, (3) optimises the
# clipped PPO objective for args.update_epochs epochs over shuffled
# minibatches, and (4) writes diagnostics to TensorBoard (and W&B if tracking).
start_time = time.time()
for update in range(1, num_updates + 1):
  # lr annealing: linear decay from args.learning_rate on the first update
  # down to args.learning_rate / num_updates on the last one.
  if args.anneal_lr:
    frac = 1.0 - (update - 1.0) / num_updates
    lrnow = frac * args.learning_rate
    optimizer.param_groups[0]["lr"] = lrnow
    writer.add_scalar("charts/learning_rate", lrnow, global_step)
    if args.track:
      wandb.log({
          "charts/learning_rate": lrnow,
      }, step=global_step)

  # policy rollout
  for step in range(0, args.num_steps):
    global_step += 1 * args.num_envs  # one step is taken in every environment
    obs[step] = next_obs
    # dones[step] is the done flag that came with the observation stored above.
    dones[step] = next_done

    # Algo Logic: Action logic
    with torch.no_grad():
      action, log_prob, _, value = agent.get_action_and_value(next_obs)
      values[step] = value.squeeze(-1)  # drop the trailing size-1 value dim
    actions[step] = action
    logprobs[step] = log_prob

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
    done = np.logical_or(terminated, truncated)  # Combine terminated and truncated into done

    rewards[step] = torch.tensor(reward).to(device).view(-1)
    next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

    # Log episodic returns when episodes finish
    if "episode" in info and info["_episode"].any():
      finished_indices = np.where(info["_episode"])[0]

      for idx in finished_indices:
        episodic_return = info["episode"]["r"][idx]
        episodic_length = info["episode"]["l"][idx]

        print(f"global_step={global_step}, episodic_return={episodic_return}")
        writer.add_scalar("charts/episodic_return", episodic_return, global_step)
        writer.add_scalar("charts/episodic_length", episodic_length, global_step)

        # Optionally log to wandb
        if args.track:
          wandb.log({
              "charts/episodic_return": episodic_return,
              "charts/episodic_length": episodic_length,
          }, step=global_step)
          # Log video to wandb for the first environment if capture_video is true
          if args.capture_video and idx == 0:
            # RecordVideo writes into the directory videos/{run_name}/, and
            # "*" does not match path separators, so the pattern has to
            # descend into that directory to find the files.
            video_files = glob.glob(f"videos/{run_name}/*.mp4")
            if video_files:
              # Sort by modification time to get the latest video
              latest_video_file = max(video_files, key=os.path.getmtime)
              wandb.log({"video": wandb.Video(latest_video_file, fps=4, format="mp4")}, step=global_step)

        break  # Log only the first finished episode per step

  # bootstrap value if not done
  with torch.no_grad():
    next_value = agent.get_value(next_obs).reshape(1, -1)
    if args.gae:
      # Generalized advantage estimation, accumulated backwards in time.
      advantages = torch.zeros_like(rewards).to(device)
      lastgaelam = 0
      for t in reversed(range(args.num_steps)):
        if t == args.num_steps - 1:
          # The step after the rollout uses the bootstrap value.
          nextnonterminal = 1.0 - next_done
          nextvalues = next_value
        else:
          nextnonterminal = 1.0 - dones[t + 1]
          nextvalues = values[t + 1]
        # TD error; nextnonterminal zeroes the bootstrap across episode ends.
        delta = rewards[t] + nextvalues * nextnonterminal * args.gamma - values[t]
        advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
      returns = advantages + values
    else:
      # Discounted returns, with advantage = return - value.
      returns = torch.zeros_like(rewards).to(device)
      for t in reversed(range(args.num_steps)):
        if t == args.num_steps - 1:
          nextnonterminal = 1.0 - next_done
          next_return = next_value
        else:
          nextnonterminal = 1.0 - dones[t + 1]
          next_return = returns[t + 1]
        returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
      advantages = returns - values

  # flatten the batch: merge the (step, env) dimensions into one
  b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
  b_logprobs = logprobs.reshape(-1)
  b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
  b_advantages = advantages.reshape(-1)
  b_returns = returns.reshape(-1)
  b_values = values.reshape(-1)

  # Optimizing the policy and value network
  b_inds = np.arange(args.batch_size)
  clipfracs = []
  for epoch in range(args.update_epochs):
    np.random.shuffle(b_inds)
    for start in range(0, args.batch_size, args.minibatch_size):
      end = start + args.minibatch_size
      mb_inds = b_inds[start:end]
      # Re-evaluate the stored actions under the current policy.
      _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
      logratio = newlogprob - b_logprobs[mb_inds]
      ratio = logratio.exp()

      with torch.no_grad():
        # calculate approx_kl http://joschu.net/blog/kl-approx.html
        old_approx_kl = (-logratio).mean()
        approx_kl = ((ratio - 1) - logratio).mean()
        # Fraction of samples whose ratio left the clipping range.
        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

      mb_advantages = b_advantages[mb_inds]
      if args.norm_adv:
        # Per-minibatch normalisation; 1e-8 guards against division by zero.
        mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

      # Policy loss: element-wise max of the unclipped and clipped losses.
      pg_loss1 = -mb_advantages * ratio
      pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
      pg_loss = torch.max(pg_loss1, pg_loss2).mean()

      # Value loss
      newvalue = newvalue.view(-1)
      if args.clip_vloss:
        # Limit the value change relative to the rollout-time estimate and
        # take the larger of the clipped and unclipped squared errors.
        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
        v_clipped = b_values[mb_inds] + torch.clamp(
            newvalue - b_values[mb_inds],
            -args.clip_coef,
            args.clip_coef,
        )
        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
        v_loss = 0.5 * v_loss_max.mean()
      else:
        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

      # Entropy loss
      entropy_loss = entropy.mean()
      # Overall loss: the entropy term is subtracted, so higher entropy lowers it.
      loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

      # Backprop
      optimizer.zero_grad()
      loss.backward()
      # Clip gradient
      nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
      optimizer.step()

    # Optional early stop, checked once per epoch against the approx_kl of
    # the last minibatch of that epoch.
    if args.target_kl is not None:
      if approx_kl > args.target_kl:
        break

  # Explained variance of the returns by the rollout-time value estimates.
  y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
  var_y = np.var(y_true)
  explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

  # TRY NOT TO MODIFY: record rewards for plotting purposes
  # Loss and KL scalars are those of the last minibatch processed.
  writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
  writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
  writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
  writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
  writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
  writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
  writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
  writer.add_scalar("losses/explained_variance", explained_var, global_step)
  # SPS: environment steps per second since the loop started.
  print("SPS:", int(global_step / (time.time() - start_time)))
  writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

envs.close()
writer.close()

global_step=60, episodic_return=15.0
global_step=68, episodic_return=17.0
global_step=84, episodic_return=21.0
global_step=100, episodic_return=25.0
global_step=152, episodic_return=16.0
global_step=164, episodic_return=25.0
global_step=176, episodic_return=26.0
global_step=220, episodic_return=29.0
global_step=224, episodic_return=11.0
global_step=268, episodic_return=25.0
global_step=308, episodic_return=20.0
global_step=336, episodic_return=16.0
global_step=356, episodic_return=50.0
global_step=376, episodic_return=16.0
global_step=396, episodic_return=9.0
global_step=448, episodic_return=17.0
global_step=464, episodic_return=31.0
global_step=468, episodic_return=17.0
global_step=492, episodic_return=10.0
SPS: 581
global_step=520, episodic_return=13.0
global_step=556, episodic_return=15.0
global_step=564, episodic_return=10.0
global_step=568, episodic_return=86.0
global_step=576, episodic_return=13.0
global_step=644, episodic_return=16.0
global_step=648, episodic_return=20.0
global_

AttributeError: 'RecordVideo' object has no attribute 'enabled'