[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/4_train_airl.ipynb)
# Train an Agent using Adversarial Inverse Reinforcement Learning

As usual, we first need an expert. Again, we download one from the HuggingFace model hub for convenience.

Note that we now use a variant of the CartPole environment from the seals package, which has fixed episode durations. Read more about why we do this [here](https://imitation.readthedocs.io/en/latest/getting-started/variable-horizon.html).

In [1]:
import numpy as np
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper

# Seed reused across the notebook (environment RNG, rollouts, learner, evaluation).
SEED = 42

# Set to True for a much shorter training run.
FAST = False

# Number of steps later passed to `airl_trainer.train(...)`.
N_RL_TRAIN_STEPS = 100_000 if FAST else 2_000_000


def _attach_rollout_info(env, _):
    """Wrap `env` in a RolloutInfoWrapper (needed for computing rollouts later).

    The second positional argument supplied by `make_vec_env` is ignored.
    """
    return RolloutInfoWrapper(env)


# Vectorized environment with 8 copies of the fixed-horizon seals CartPole.
venv = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=np.random.default_rng(SEED),
    n_envs=8,
    post_wrappers=[_attach_rollout_info],
)

# Pre-trained expert policy fetched from the HuggingFace model hub.
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name="seals/CartPole-v0",
    venv=venv,
)

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Visualization of Expert Policy:

In [2]:
import gymnasium as gym
import os
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

# Open a window when a display server is available; otherwise fetch RGB frames
# and show them inline in the notebook.
render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)

# try/finally guarantees the environment (and any render window) is closed even
# if the loop is interrupted or raises.
try:
    # Gymnasium returns (obs, info) from reset(); unpack so `obs` is the raw observation
    obs, info = env.reset()

    # The loop index must not be named `_`: the `predict` call below unpacks its
    # second return value into `_`, which would clobber the index and break the
    # `% 10` frame-throttling check.
    for step_idx in range(1000):
        # Pass only the observation (not the (obs, info) tuple) to the SB3 policy
        action, _ = expert.predict(obs, deterministic=True)

        # Gymnasium step returns (obs, reward, terminated, truncated, info)
        obs, rew, terminated, truncated, info = env.step(action)

        # Render depending on mode. For `rgb_array` we display frames inside the notebook.
        if render_mode == "rgb_array":
            frame = env.render()
            # display every 10 frames to avoid excessive output
            if step_idx % 10 == 0:
                clear_output(wait=True)
                plt.imshow(frame)
                plt.axis('off')
                display(plt.gcf())
        else:
            env.render()   # opens a window (when DISPLAY is available)

        # episode end when either terminated or truncated
        if terminated or truncated:
            obs, info = env.reset()
            # if running in rgb_array mode, stop after one episode
            if render_mode == "rgb_array":
                break
finally:
    env.close()

  from pkg_resources import resource_stream, resource_exists


We generate some expert trajectories that the discriminator needs to distinguish from the learner's trajectories.

In [3]:
from imitation.data import rollout

# Stop condition for sampling: min_episodes=60, with no minimum on timesteps.
sample_until = rollout.make_sample_until(min_timesteps=None, min_episodes=60)

# Dedicated generator so the rollouts are reproducible from SEED.
rollout_rng = np.random.default_rng(SEED)

# Expert demonstrations collected in the vectorized environment.
rollouts = rollout.rollout(expert, venv, sample_until, rng=rollout_rng)

Visualization of sampled Expert Trajectories:

In [4]:
# Exact playback from saved observations (CartPole-specific).
# This will set the environment internal state from each saved observation and render it,
# avoiding stepping actions so visuals match the saved rollouts exactly where possible.
import os
import time
import numpy as np
import gymnasium as gym
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def _show_frame(env, render_mode):
    """Render the current state of `env` once.

    In 'rgb_array' mode the frame returned by `env.render()` replaces the
    previous notebook output; in any other mode `env.render()` is simply called.
    """
    if render_mode == 'rgb_array':
        frame = env.render()
        clear_output(wait=True)
        plt.imshow(frame)
        plt.axis('off')
        display(plt.gcf())
    else:
        env.render()


# Ensure rollouts exist
try:
    trajs = rollouts if hasattr(rollouts, '__len__') else [rollouts]
except NameError:
    raise RuntimeError('`rollouts` not found — run the rollout generation cell first')

render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
n_play = min(5, len(trajs))
print(f'Playing {n_play} sampled expert rollouts exactly from saved observations (render_mode={render_mode})')

for i in range(n_play):
    traj = trajs[i]
    # Make a fresh env for each trajectory
    env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)
    # try/finally so the env (and its render window) is closed even when the
    # playback is interrupted or raises part-way through.
    try:
        env.reset()

        # Check if we can set internal state directly (CartPole exposes `env.unwrapped.state`).
        can_set_state = hasattr(env, 'unwrapped') and hasattr(env.unwrapped, 'state')
        if not can_set_state:
            print('Warning: env does not support setting internal state; exact playback not possible. Falling back to stepping actions.')

        if can_set_state:
            # Render each saved observation in sequence by writing it into the env state
            for t, obs_state in enumerate(traj.obs):
                st = np.asarray(obs_state).copy()
                try:
                    env.unwrapped.state = st
                except Exception as e:
                    print('Could not assign env.unwrapped.state, falling back to stepping. Error:', e)
                    can_set_state = False
                    break
                _show_frame(env, render_mode)
                # small delay so notebook updates are visible
                time.sleep(0.02)

        if not can_set_state:
            # Fallback: step the saved actions into the env (may diverge from original)
            env.reset()
            for act in traj.acts:
                a = np.asarray(act)
                # 0-d arrays are converted to plain Python scalars before stepping
                if a.shape == ():
                    a = a.item()
                obs, rew, terminated, truncated, info = env.step(a)
                _show_frame(env, render_mode)
                if terminated or truncated:
                    break
    finally:
        env.close()

    # brief pause between trajectories
    if render_mode == 'rgb_array':
        time.sleep(0.5)

print('Done playing exact rollouts')

Playing 5 sampled expert rollouts exactly from saved observations (render_mode=human)


KeyboardInterrupt: 

Now we are ready to set up our AIRL trainer.
Note that the `reward_net` is actually the network of the discriminator.
We evaluate the learner before and after training so we can see if it made any progress.

Initialization of Learner policy (AIRL Generator):

In [5]:
from imitation.algorithms.adversarial.airl import AIRL
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy


# PPO hyperparameters for the learner (the AIRL generator).
ppo_kwargs = dict(
    batch_size=64,
    n_epochs=5,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    ent_coef=0.0,
    vf_coef=0.1,
)

# Untrained MLP policy acting in the shared vectorized environment.
learner = PPO(policy=MlpPolicy, env=venv, seed=SEED, **ppo_kwargs)


Visualization of untrained Learner Policy (AIRL Generator):

In [6]:
# Open a window when a display server is available; otherwise fetch RGB frames
# and show them inline in the notebook.
render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)

# try/finally guarantees the environment (and any render window) is closed even
# if the loop is interrupted or raises.
try:
    # Gymnasium returns (obs, info) from reset(); unpack so `obs` is the raw observation
    obs, info = env.reset()

    # The loop index must not be named `_`: the `predict` call below unpacks its
    # second return value into `_`, which would clobber the index and break the
    # `% 10` frame-throttling check.
    for step_idx in range(1000):
        # Pass only the observation (not the (obs, info) tuple) to the SB3 policy
        action, _ = learner.predict(obs, deterministic=True)

        # Gymnasium step returns (obs, reward, terminated, truncated, info)
        obs, rew, terminated, truncated, info = env.step(action)

        # Render depending on mode. For `rgb_array` we display frames inside the notebook.
        if render_mode == "rgb_array":
            frame = env.render()
            # display every 10 frames to avoid excessive output
            if step_idx % 10 == 0:
                clear_output(wait=True)
                plt.imshow(frame)
                plt.axis('off')
                display(plt.gcf())
        else:
            env.render()   # opens a window (when DISPLAY is available)

        # episode end when either terminated or truncated
        if terminated or truncated:
            obs, info = env.reset()
            # if running in rgb_array mode, stop after one episode
            if render_mode == "rgb_array":
                break
finally:
    env.close()

KeyboardInterrupt: 

AIRL Training:

In [7]:
def _evaluate_learner():
    """Re-seed `venv` with SEED and return the learner's per-episode rewards.

    Runs `evaluate_policy` for 100 episodes with `return_episode_rewards=True`
    and discards the second element of the returned pair.
    """
    venv.seed(SEED)
    episode_rewards, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    return episode_rewards


# Reward network trained by AIRL; inputs pass through a RunningNorm layer.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer: expert demonstrations vs. the PPO learner acting as generator.
airl_trainer = AIRL(
    demonstrations=rollouts,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
)

# Evaluate before and after training, re-seeding the env both times.
learner_rewards_before_training = _evaluate_learner()
airl_trainer.train(N_RL_TRAIN_STEPS)
learner_rewards_after_training = _evaluate_learner()

round:   0%|          | 0/122 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 500      |
|    gen/rollout/ep_rew_mean  | 34.4     |
|    gen/time/fps             | 7603     |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 2        |
|    gen/time/total_timesteps | 16384    |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.505    |
|    disc/disc_acc_expert             | 1        |
|    disc/disc_acc_gen                | 0.0103   |
|    disc/disc_entropy                | 0.663    |
|    disc/disc_loss                   | 0.74     |
|    disc/disc_proportion_expert_pred | 0.995    |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 2.05e+03 |
|    disc/n_generated                 | 2.05e+03 |
-

round:   1%|          | 1/122 [00:04<08:50,  4.39s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 36.7         |
|    gen/rollout/ep_rew_wrapped_mean | -572         |
|    gen/time/fps                    | 7824         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 32768        |
|    gen/train/approx_kl             | 0.0009234178 |
|    gen/train/clip_fraction         | 0.026        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.692       |
|    gen/train/explained_variance    | -0.0146      |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 4.4          |
|    gen/train/n_updates             | 5            |
|    gen/train/policy_gradient_loss  | -0.000137    |
|    gen/train/value_loss   

round:   2%|▏         | 2/122 [00:08<08:39,  4.33s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 35.3         |
|    gen/rollout/ep_rew_wrapped_mean | -441         |
|    gen/time/fps                    | 7791         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 49152        |
|    gen/train/approx_kl             | 0.0023373594 |
|    gen/train/clip_fraction         | 0.108        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.689       |
|    gen/train/explained_variance    | 0.802        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0926       |
|    gen/train/n_updates             | 10           |
|    gen/train/policy_gradient_loss  | -0.0027      |
|    gen/train/value_loss   

round:   2%|▏         | 3/122 [00:12<08:26,  4.26s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 35.6         |
|    gen/rollout/ep_rew_wrapped_mean | -438         |
|    gen/time/fps                    | 7770         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 65536        |
|    gen/train/approx_kl             | 0.0022240467 |
|    gen/train/clip_fraction         | 0.116        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.687       |
|    gen/train/explained_variance    | 0.904        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0987       |
|    gen/train/n_updates             | 15           |
|    gen/train/policy_gradient_loss  | -0.00371     |
|    gen/train/value_loss   

round:   3%|▎         | 4/122 [00:17<08:22,  4.26s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 35.2         |
|    gen/rollout/ep_rew_wrapped_mean | -375         |
|    gen/time/fps                    | 7828         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 81920        |
|    gen/train/approx_kl             | 0.0024276492 |
|    gen/train/clip_fraction         | 0.11         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.684       |
|    gen/train/explained_variance    | 0.937        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0504       |
|    gen/train/n_updates             | 20           |
|    gen/train/policy_gradient_loss  | -0.00517     |
|    gen/train/value_loss   

round:   4%|▍         | 5/122 [00:21<08:14,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 36.7         |
|    gen/rollout/ep_rew_wrapped_mean | -451         |
|    gen/time/fps                    | 7794         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 98304        |
|    gen/train/approx_kl             | 0.0027766335 |
|    gen/train/clip_fraction         | 0.164        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.679       |
|    gen/train/explained_variance    | 0.833        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.141        |
|    gen/train/n_updates             | 25           |
|    gen/train/policy_gradient_loss  | -0.00623     |
|    gen/train/value_loss   

round:   5%|▍         | 6/122 [00:25<08:07,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 41.4         |
|    gen/rollout/ep_rew_wrapped_mean | -495         |
|    gen/time/fps                    | 7803         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 114688       |
|    gen/train/approx_kl             | 0.0030214149 |
|    gen/train/clip_fraction         | 0.183        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.675       |
|    gen/train/explained_variance    | 0.836        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0388       |
|    gen/train/n_updates             | 30           |
|    gen/train/policy_gradient_loss  | -0.00651     |
|    gen/train/value_loss   

round:   6%|▌         | 7/122 [00:29<08:07,  4.24s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 43.6         |
|    gen/rollout/ep_rew_wrapped_mean | -521         |
|    gen/time/fps                    | 7903         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 131072       |
|    gen/train/approx_kl             | 0.0033448143 |
|    gen/train/clip_fraction         | 0.211        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.668       |
|    gen/train/explained_variance    | 0.903        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0441       |
|    gen/train/n_updates             | 35           |
|    gen/train/policy_gradient_loss  | -0.0107      |
|    gen/train/value_loss   

round:   7%|▋         | 8/122 [00:33<08:00,  4.21s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 46.3        |
|    gen/rollout/ep_rew_wrapped_mean | -556        |
|    gen/time/fps                    | 7807        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 147456      |
|    gen/train/approx_kl             | 0.004198321 |
|    gen/train/clip_fraction         | 0.285       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.66       |
|    gen/train/explained_variance    | 0.93        |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0392      |
|    gen/train/n_updates             | 40          |
|    gen/train/policy_gradient_loss  | -0.0147     |
|    gen/train/value_loss            | 0.991  

round:   7%|▋         | 9/122 [00:38<07:53,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 48           |
|    gen/rollout/ep_rew_wrapped_mean | -572         |
|    gen/time/fps                    | 7848         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 163840       |
|    gen/train/approx_kl             | 0.0045685414 |
|    gen/train/clip_fraction         | 0.314        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.653       |
|    gen/train/explained_variance    | 0.944        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0356       |
|    gen/train/n_updates             | 45           |
|    gen/train/policy_gradient_loss  | -0.016       |
|    gen/train/value_loss   

round:   8%|▊         | 10/122 [00:42<07:52,  4.22s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 50.8        |
|    gen/rollout/ep_rew_wrapped_mean | -587        |
|    gen/time/fps                    | 7761        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 180224      |
|    gen/train/approx_kl             | 0.004641582 |
|    gen/train/clip_fraction         | 0.32        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.647      |
|    gen/train/explained_variance    | 0.957       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.086       |
|    gen/train/n_updates             | 50          |
|    gen/train/policy_gradient_loss  | -0.0163     |
|    gen/train/value_loss            | 1.04   

round:   9%|▉         | 11/122 [00:46<07:46,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 54          |
|    gen/rollout/ep_rew_wrapped_mean | -590        |
|    gen/time/fps                    | 7797        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 196608      |
|    gen/train/approx_kl             | 0.004029328 |
|    gen/train/clip_fraction         | 0.258       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.64       |
|    gen/train/explained_variance    | 0.963       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0641      |
|    gen/train/n_updates             | 55          |
|    gen/train/policy_gradient_loss  | -0.0145     |
|    gen/train/value_loss            | 0.959  

round:  10%|▉         | 12/122 [00:50<07:40,  4.18s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 56.1        |
|    gen/rollout/ep_rew_wrapped_mean | -604        |
|    gen/time/fps                    | 7825        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 212992      |
|    gen/train/approx_kl             | 0.004188104 |
|    gen/train/clip_fraction         | 0.236       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.632      |
|    gen/train/explained_variance    | 0.952       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0839      |
|    gen/train/n_updates             | 60          |
|    gen/train/policy_gradient_loss  | -0.011      |
|    gen/train/value_loss            | 1.08   

round:  11%|█         | 13/122 [00:54<07:38,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 58.6        |
|    gen/rollout/ep_rew_wrapped_mean | -638        |
|    gen/time/fps                    | 7867        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 229376      |
|    gen/train/approx_kl             | 0.003712315 |
|    gen/train/clip_fraction         | 0.208       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.628      |
|    gen/train/explained_variance    | 0.964       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0738      |
|    gen/train/n_updates             | 65          |
|    gen/train/policy_gradient_loss  | -0.00833    |
|    gen/train/value_loss            | 0.915  

round:  11%|█▏        | 14/122 [00:59<07:32,  4.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 63.8        |
|    gen/rollout/ep_rew_wrapped_mean | -688        |
|    gen/time/fps                    | 7820        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 245760      |
|    gen/train/approx_kl             | 0.004045919 |
|    gen/train/clip_fraction         | 0.206       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.62       |
|    gen/train/explained_variance    | 0.974       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0645      |
|    gen/train/n_updates             | 70          |
|    gen/train/policy_gradient_loss  | -0.00788    |
|    gen/train/value_loss            | 0.901  

round:  12%|█▏        | 15/122 [01:03<07:28,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 69.8         |
|    gen/rollout/ep_rew_wrapped_mean | -730         |
|    gen/time/fps                    | 7732         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 262144       |
|    gen/train/approx_kl             | 0.0036164187 |
|    gen/train/clip_fraction         | 0.171        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.612       |
|    gen/train/explained_variance    | 0.98         |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0701       |
|    gen/train/n_updates             | 75           |
|    gen/train/policy_gradient_loss  | -0.00549     |
|    gen/train/value_loss   

round:  13%|█▎        | 16/122 [01:07<07:28,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 88.3         |
|    gen/rollout/ep_rew_wrapped_mean | -735         |
|    gen/time/fps                    | 7795         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 278528       |
|    gen/train/approx_kl             | 0.0039891414 |
|    gen/train/clip_fraction         | 0.146        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.597       |
|    gen/train/explained_variance    | 0.982        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0572       |
|    gen/train/n_updates             | 80           |
|    gen/train/policy_gradient_loss  | -0.00424     |
|    gen/train/value_loss   

round:  14%|█▍        | 17/122 [01:11<07:24,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 107          |
|    gen/rollout/ep_rew_wrapped_mean | -729         |
|    gen/time/fps                    | 7732         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 294912       |
|    gen/train/approx_kl             | 0.0035532098 |
|    gen/train/clip_fraction         | 0.133        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.586       |
|    gen/train/explained_variance    | 0.982        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0376       |
|    gen/train/n_updates             | 85           |
|    gen/train/policy_gradient_loss  | -0.00364     |
|    gen/train/value_loss   

round:  15%|█▍        | 18/122 [01:16<07:20,  4.24s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 130          |
|    gen/rollout/ep_rew_wrapped_mean | -727         |
|    gen/time/fps                    | 7753         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 311296       |
|    gen/train/approx_kl             | 0.0028050137 |
|    gen/train/clip_fraction         | 0.132        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.578       |
|    gen/train/explained_variance    | 0.979        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0415       |
|    gen/train/n_updates             | 90           |
|    gen/train/policy_gradient_loss  | -0.00135     |
|    gen/train/value_loss   

round:  16%|█▌        | 19/122 [01:20<07:19,  4.27s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 145          |
|    gen/rollout/ep_rew_wrapped_mean | -740         |
|    gen/time/fps                    | 7818         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 327680       |
|    gen/train/approx_kl             | 0.0032455113 |
|    gen/train/clip_fraction         | 0.147        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.569       |
|    gen/train/explained_variance    | 0.985        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0765       |
|    gen/train/n_updates             | 95           |
|    gen/train/policy_gradient_loss  | -0.00292     |
|    gen/train/value_loss   

round:  16%|█▋        | 20/122 [01:24<07:13,  4.25s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 170          |
|    gen/rollout/ep_rew_wrapped_mean | -757         |
|    gen/time/fps                    | 7740         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 344064       |
|    gen/train/approx_kl             | 0.0040733693 |
|    gen/train/clip_fraction         | 0.174        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.556       |
|    gen/train/explained_variance    | 0.988        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0475       |
|    gen/train/n_updates             | 100          |
|    gen/train/policy_gradient_loss  | -0.00421     |
|    gen/train/value_loss   

round:  17%|█▋        | 21/122 [01:28<07:06,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 182          |
|    gen/rollout/ep_rew_wrapped_mean | -760         |
|    gen/time/fps                    | 7877         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 360448       |
|    gen/train/approx_kl             | 0.0030095784 |
|    gen/train/clip_fraction         | 0.129        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.539       |
|    gen/train/explained_variance    | 0.987        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.237        |
|    gen/train/n_updates             | 105          |
|    gen/train/policy_gradient_loss  | -0.00231     |
|    gen/train/value_loss   

round:  18%|█▊        | 22/122 [01:33<07:03,  4.24s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 197          |
|    gen/rollout/ep_rew_wrapped_mean | -800         |
|    gen/time/fps                    | 7893         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 376832       |
|    gen/train/approx_kl             | 0.0027293628 |
|    gen/train/clip_fraction         | 0.126        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.538       |
|    gen/train/explained_variance    | 0.981        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.222        |
|    gen/train/n_updates             | 110          |
|    gen/train/policy_gradient_loss  | -0.00342     |
|    gen/train/value_loss   

round:  19%|█▉        | 23/122 [01:37<06:57,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 203          |
|    gen/rollout/ep_rew_wrapped_mean | -838         |
|    gen/time/fps                    | 7944         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 393216       |
|    gen/train/approx_kl             | 0.0032307925 |
|    gen/train/clip_fraction         | 0.15         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.525       |
|    gen/train/explained_variance    | 0.991        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.131        |
|    gen/train/n_updates             | 115          |
|    gen/train/policy_gradient_loss  | -0.00459     |
|    gen/train/value_loss   

round:  20%|█▉        | 24/122 [01:41<06:50,  4.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 228         |
|    gen/rollout/ep_rew_wrapped_mean | -906        |
|    gen/time/fps                    | 7511        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 409600      |
|    gen/train/approx_kl             | 0.003686633 |
|    gen/train/clip_fraction         | 0.179       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.513      |
|    gen/train/explained_variance    | 0.993       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.327       |
|    gen/train/n_updates             | 120         |
|    gen/train/policy_gradient_loss  | -0.00697    |
|    gen/train/value_loss            | 2.22   

round:  20%|██        | 25/122 [01:45<06:47,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 249          |
|    gen/rollout/ep_rew_wrapped_mean | -910         |
|    gen/time/fps                    | 7911         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 425984       |
|    gen/train/approx_kl             | 0.0037332843 |
|    gen/train/clip_fraction         | 0.171        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.478       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.104        |
|    gen/train/n_updates             | 125          |
|    gen/train/policy_gradient_loss  | -0.00415     |
|    gen/train/value_loss   

round:  21%|██▏       | 26/122 [01:49<06:42,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 279          |
|    gen/rollout/ep_rew_wrapped_mean | -897         |
|    gen/time/fps                    | 7916         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 442368       |
|    gen/train/approx_kl             | 0.0037163077 |
|    gen/train/clip_fraction         | 0.168        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.469       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.22         |
|    gen/train/n_updates             | 130          |
|    gen/train/policy_gradient_loss  | -0.00603     |
|    gen/train/value_loss   

round:  22%|██▏       | 27/122 [01:53<06:36,  4.17s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 306        |
|    gen/rollout/ep_rew_wrapped_mean | -842       |
|    gen/time/fps                    | 7517       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 458752     |
|    gen/train/approx_kl             | 0.00365252 |
|    gen/train/clip_fraction         | 0.154      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.443     |
|    gen/train/explained_variance    | 0.995      |
|    gen/train/learning_rate         | 0.0005     |
|    gen/train/loss                  | 0.271      |
|    gen/train/n_updates             | 135        |
|    gen/train/policy_gradient_loss  | -0.00319   |
|    gen/train/value_loss            | 1.71       |
------------

round:  23%|██▎       | 28/122 [01:58<06:34,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 334          |
|    gen/rollout/ep_rew_wrapped_mean | -776         |
|    gen/time/fps                    | 7918         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 475136       |
|    gen/train/approx_kl             | 0.0030289693 |
|    gen/train/clip_fraction         | 0.129        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.41        |
|    gen/train/explained_variance    | 0.993        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.095        |
|    gen/train/n_updates             | 140          |
|    gen/train/policy_gradient_loss  | -0.00224     |
|    gen/train/value_loss   

round:  24%|██▍       | 29/122 [02:02<06:28,  4.18s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 346         |
|    gen/rollout/ep_rew_wrapped_mean | -692        |
|    gen/time/fps                    | 7913        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 491520      |
|    gen/train/approx_kl             | 0.003505771 |
|    gen/train/clip_fraction         | 0.139       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.402      |
|    gen/train/explained_variance    | 0.996       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0827      |
|    gen/train/n_updates             | 145         |
|    gen/train/policy_gradient_loss  | -0.00274    |
|    gen/train/value_loss            | 1.42   

round:  25%|██▍       | 30/122 [02:06<06:26,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 369         |
|    gen/rollout/ep_rew_wrapped_mean | -671        |
|    gen/time/fps                    | 7872        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 507904      |
|    gen/train/approx_kl             | 0.003373016 |
|    gen/train/clip_fraction         | 0.133       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.391      |
|    gen/train/explained_variance    | 0.994       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0825      |
|    gen/train/n_updates             | 150         |
|    gen/train/policy_gradient_loss  | -0.00161    |
|    gen/train/value_loss            | 1.37   

round:  25%|██▌       | 31/122 [02:10<06:21,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 410          |
|    gen/rollout/ep_rew_wrapped_mean | -595         |
|    gen/time/fps                    | 7798         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 524288       |
|    gen/train/approx_kl             | 0.0030152581 |
|    gen/train/clip_fraction         | 0.101        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.354       |
|    gen/train/explained_variance    | 0.99         |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.24         |
|    gen/train/n_updates             | 155          |
|    gen/train/policy_gradient_loss  | -0.000121    |
|    gen/train/value_loss   

round:  26%|██▌       | 32/122 [02:14<06:16,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 444          |
|    gen/rollout/ep_rew_wrapped_mean | -445         |
|    gen/time/fps                    | 7879         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 540672       |
|    gen/train/approx_kl             | 0.0029416215 |
|    gen/train/clip_fraction         | 0.117        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.301       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0804       |
|    gen/train/n_updates             | 160          |
|    gen/train/policy_gradient_loss  | 3.63e-05     |
|    gen/train/value_loss   

round:  27%|██▋       | 33/122 [02:19<06:14,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 474          |
|    gen/rollout/ep_rew_wrapped_mean | -326         |
|    gen/time/fps                    | 7878         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 557056       |
|    gen/train/approx_kl             | 0.0032478413 |
|    gen/train/clip_fraction         | 0.113        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.292       |
|    gen/train/explained_variance    | 0.996        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0361       |
|    gen/train/n_updates             | 165          |
|    gen/train/policy_gradient_loss  | -0.00064     |
|    gen/train/value_loss   

round:  28%|██▊       | 34/122 [02:23<06:09,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 488          |
|    gen/rollout/ep_rew_wrapped_mean | -215         |
|    gen/time/fps                    | 7865         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 573440       |
|    gen/train/approx_kl             | 0.0026315642 |
|    gen/train/clip_fraction         | 0.088        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.263       |
|    gen/train/explained_variance    | 0.978        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0968       |
|    gen/train/n_updates             | 170          |
|    gen/train/policy_gradient_loss  | -0.0012      |
|    gen/train/value_loss   

round:  29%|██▊       | 35/122 [02:27<06:04,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 496          |
|    gen/rollout/ep_rew_wrapped_mean | -218         |
|    gen/time/fps                    | 7904         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 589824       |
|    gen/train/approx_kl             | 0.0013315232 |
|    gen/train/clip_fraction         | 0.071        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.272       |
|    gen/train/explained_variance    | 0.983        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.663        |
|    gen/train/n_updates             | 175          |
|    gen/train/policy_gradient_loss  | 0.00126      |
|    gen/train/value_loss   

round:  30%|██▉       | 36/122 [02:31<06:02,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 497          |
|    gen/rollout/ep_rew_wrapped_mean | -220         |
|    gen/time/fps                    | 7892         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 606208       |
|    gen/train/approx_kl             | 0.0013373723 |
|    gen/train/clip_fraction         | 0.0699       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.267       |
|    gen/train/explained_variance    | 0.983        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.092        |
|    gen/train/n_updates             | 180          |
|    gen/train/policy_gradient_loss  | 0.000537     |
|    gen/train/value_loss   

round:  30%|███       | 37/122 [02:35<05:56,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 498          |
|    gen/rollout/ep_rew_wrapped_mean | -242         |
|    gen/time/fps                    | 7912         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 622592       |
|    gen/train/approx_kl             | 0.0015112173 |
|    gen/train/clip_fraction         | 0.0783       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.265       |
|    gen/train/explained_variance    | 0.982        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0341       |
|    gen/train/n_updates             | 185          |
|    gen/train/policy_gradient_loss  | 0.000729     |
|    gen/train/value_loss   

round:  31%|███       | 38/122 [02:39<05:50,  4.18s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 499         |
|    gen/rollout/ep_rew_wrapped_mean | -210        |
|    gen/time/fps                    | 7945        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 638976      |
|    gen/train/approx_kl             | 0.002092414 |
|    gen/train/clip_fraction         | 0.106       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.261      |
|    gen/train/explained_variance    | 0.985       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.00697     |
|    gen/train/n_updates             | 190         |
|    gen/train/policy_gradient_loss  | -0.00198    |
|    gen/train/value_loss            | 0.202  

round:  32%|███▏      | 39/122 [02:44<05:47,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -209         |
|    gen/time/fps                    | 7908         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 655360       |
|    gen/train/approx_kl             | 0.0026861937 |
|    gen/train/clip_fraction         | 0.13         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.262       |
|    gen/train/explained_variance    | 0.991        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00884      |
|    gen/train/n_updates             | 195          |
|    gen/train/policy_gradient_loss  | -0.00293     |
|    gen/train/value_loss   

round:  33%|███▎      | 40/122 [02:48<05:42,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -179         |
|    gen/time/fps                    | 7887         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 671744       |
|    gen/train/approx_kl             | 0.0026062995 |
|    gen/train/clip_fraction         | 0.118        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.253       |
|    gen/train/explained_variance    | 0.987        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0126       |
|    gen/train/n_updates             | 200          |
|    gen/train/policy_gradient_loss  | -0.00306     |
|    gen/train/value_loss   

round:  34%|███▎      | 41/122 [02:52<05:38,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -183         |
|    gen/time/fps                    | 7416         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 688128       |
|    gen/train/approx_kl             | 0.0044944417 |
|    gen/train/clip_fraction         | 0.172        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.257       |
|    gen/train/explained_variance    | 0.991        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0337       |
|    gen/train/n_updates             | 205          |
|    gen/train/policy_gradient_loss  | -0.0058      |
|    gen/train/value_loss   

round:  34%|███▍      | 42/122 [02:56<05:37,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -166         |
|    gen/time/fps                    | 7888         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 704512       |
|    gen/train/approx_kl             | 0.0047647166 |
|    gen/train/clip_fraction         | 0.157        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.241       |
|    gen/train/explained_variance    | 0.981        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0259       |
|    gen/train/n_updates             | 210          |
|    gen/train/policy_gradient_loss  | -0.00384     |
|    gen/train/value_loss   

round:  35%|███▌      | 43/122 [03:00<05:31,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -160         |
|    gen/time/fps                    | 7904         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 720896       |
|    gen/train/approx_kl             | 0.0024691685 |
|    gen/train/clip_fraction         | 0.0962       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.245       |
|    gen/train/explained_variance    | 0.973        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00742      |
|    gen/train/n_updates             | 215          |
|    gen/train/policy_gradient_loss  | -0.00184     |
|    gen/train/value_loss   

round:  36%|███▌      | 44/122 [03:05<05:28,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -127         |
|    gen/time/fps                    | 7865         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 737280       |
|    gen/train/approx_kl             | 0.0018231185 |
|    gen/train/clip_fraction         | 0.0837       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.237       |
|    gen/train/explained_variance    | 0.964        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00379      |
|    gen/train/n_updates             | 220          |
|    gen/train/policy_gradient_loss  | -0.000937    |
|    gen/train/value_loss   

round:  37%|███▋      | 45/122 [03:09<05:23,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -87.4        |
|    gen/time/fps                    | 7912         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 753664       |
|    gen/train/approx_kl             | 0.0015412753 |
|    gen/train/clip_fraction         | 0.0689       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.228       |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00254      |
|    gen/train/n_updates             | 225          |
|    gen/train/policy_gradient_loss  | -0.000402    |
|    gen/train/value_loss   

round:  38%|███▊      | 46/122 [03:13<05:17,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -64.2        |
|    gen/time/fps                    | 7937         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 770048       |
|    gen/train/approx_kl             | 0.0016819426 |
|    gen/train/clip_fraction         | 0.0727       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.222       |
|    gen/train/explained_variance    | 0.971        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00256      |
|    gen/train/n_updates             | 230          |
|    gen/train/policy_gradient_loss  | -0.000324    |
|    gen/train/value_loss   

round:  39%|███▊      | 47/122 [03:17<05:14,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -57.8       |
|    gen/time/fps                    | 7918        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 786432      |
|    gen/train/approx_kl             | 0.001996402 |
|    gen/train/clip_fraction         | 0.0834      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.223      |
|    gen/train/explained_variance    | 0.979       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 9.39e-05    |
|    gen/train/n_updates             | 235         |
|    gen/train/policy_gradient_loss  | 6.8e-05     |
|    gen/train/value_loss            | 0.117  

round:  39%|███▉      | 48/122 [03:21<05:09,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -71.3        |
|    gen/time/fps                    | 7897         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 802816       |
|    gen/train/approx_kl             | 0.0013873216 |
|    gen/train/clip_fraction         | 0.0659       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.216       |
|    gen/train/explained_variance    | 0.918        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0488       |
|    gen/train/n_updates             | 240          |
|    gen/train/policy_gradient_loss  | 0.000458     |
|    gen/train/value_loss   

round:  40%|████      | 49/122 [03:26<05:04,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -102         |
|    gen/time/fps                    | 7897         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 819200       |
|    gen/train/approx_kl             | 0.0014868517 |
|    gen/train/clip_fraction         | 0.0622       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.216       |
|    gen/train/explained_variance    | 0.819        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0144       |
|    gen/train/n_updates             | 245          |
|    gen/train/policy_gradient_loss  | 0.000527     |
|    gen/train/value_loss   

round:  41%|████      | 50/122 [03:30<05:02,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -136         |
|    gen/time/fps                    | 7903         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 835584       |
|    gen/train/approx_kl             | 0.0013039727 |
|    gen/train/clip_fraction         | 0.0629       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.212       |
|    gen/train/explained_variance    | 0.8          |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0441       |
|    gen/train/n_updates             | 250          |
|    gen/train/policy_gradient_loss  | 0.000297     |
|    gen/train/value_loss   

round:  42%|████▏     | 51/122 [03:34<04:56,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -164         |
|    gen/time/fps                    | 7900         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 851968       |
|    gen/train/approx_kl             | 0.0017487617 |
|    gen/train/clip_fraction         | 0.0751       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.215       |
|    gen/train/explained_variance    | 0.816        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00599      |
|    gen/train/n_updates             | 255          |
|    gen/train/policy_gradient_loss  | 0.000241     |
|    gen/train/value_loss   

round:  43%|████▎     | 52/122 [03:38<04:52,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -165         |
|    gen/time/fps                    | 7504         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 868352       |
|    gen/train/approx_kl             | 0.0019083681 |
|    gen/train/clip_fraction         | 0.0715       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.21        |
|    gen/train/explained_variance    | 0.816        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00187      |
|    gen/train/n_updates             | 260          |
|    gen/train/policy_gradient_loss  | 0.000166     |
|    gen/train/value_loss   

round:  43%|████▎     | 53/122 [03:42<04:49,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -156         |
|    gen/time/fps                    | 7917         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 884736       |
|    gen/train/approx_kl             | 0.0018734494 |
|    gen/train/clip_fraction         | 0.0807       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.213       |
|    gen/train/explained_variance    | 0.87         |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0228       |
|    gen/train/n_updates             | 265          |
|    gen/train/policy_gradient_loss  | -0.000553    |
|    gen/train/value_loss   

round:  44%|████▍     | 54/122 [03:47<04:44,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -139         |
|    gen/time/fps                    | 7899         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 901120       |
|    gen/train/approx_kl             | 0.0042347596 |
|    gen/train/clip_fraction         | 0.129        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.2         |
|    gen/train/explained_variance    | 0.954        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00882      |
|    gen/train/n_updates             | 270          |
|    gen/train/policy_gradient_loss  | -0.00318     |
|    gen/train/value_loss   

round:  45%|████▌     | 55/122 [03:51<04:41,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -132         |
|    gen/time/fps                    | 7950         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 917504       |
|    gen/train/approx_kl             | 0.0021981834 |
|    gen/train/clip_fraction         | 0.0844       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.205       |
|    gen/train/explained_variance    | 0.797        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00571      |
|    gen/train/n_updates             | 275          |
|    gen/train/policy_gradient_loss  | -0.000354    |
|    gen/train/value_loss   

round:  46%|████▌     | 56/122 [03:55<04:36,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -123         |
|    gen/time/fps                    | 7886         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 933888       |
|    gen/train/approx_kl             | 0.0013611612 |
|    gen/train/clip_fraction         | 0.0668       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.209       |
|    gen/train/explained_variance    | 0.783        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.000791     |
|    gen/train/n_updates             | 280          |
|    gen/train/policy_gradient_loss  | 0.000471     |
|    gen/train/value_loss   

round:  47%|████▋     | 57/122 [03:59<04:31,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -123         |
|    gen/time/fps                    | 7864         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 950272       |
|    gen/train/approx_kl             | 0.0013810833 |
|    gen/train/clip_fraction         | 0.0629       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.218       |
|    gen/train/explained_variance    | 0.885        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00916      |
|    gen/train/n_updates             | 285          |
|    gen/train/policy_gradient_loss  | 0.000363     |
|    gen/train/value_loss   

round:  48%|████▊     | 58/122 [04:03<04:29,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -126         |
|    gen/time/fps                    | 7878         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 966656       |
|    gen/train/approx_kl             | 0.0016557232 |
|    gen/train/clip_fraction         | 0.0725       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.212       |
|    gen/train/explained_variance    | 0.972        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00205      |
|    gen/train/n_updates             | 290          |
|    gen/train/policy_gradient_loss  | -0.000237    |
|    gen/train/value_loss   

round:  48%|████▊     | 59/122 [04:08<04:24,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -131         |
|    gen/time/fps                    | 7885         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 983040       |
|    gen/train/approx_kl             | 0.0019946336 |
|    gen/train/clip_fraction         | 0.0736       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.211       |
|    gen/train/explained_variance    | 0.964        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0323       |
|    gen/train/n_updates             | 295          |
|    gen/train/policy_gradient_loss  | -0.000127    |
|    gen/train/value_loss   

round:  49%|████▉     | 60/122 [04:12<04:19,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -140         |
|    gen/time/fps                    | 7905         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 999424       |
|    gen/train/approx_kl             | 0.0017185234 |
|    gen/train/clip_fraction         | 0.0766       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.203       |
|    gen/train/explained_variance    | 0.965        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00765      |
|    gen/train/n_updates             | 300          |
|    gen/train/policy_gradient_loss  | -9.57e-05    |
|    gen/train/value_loss   

round:  50%|█████     | 61/122 [04:16<04:16,  4.21s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -153        |
|    gen/time/fps                    | 7935        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1015808     |
|    gen/train/approx_kl             | 0.001679355 |
|    gen/train/clip_fraction         | 0.0681      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.203      |
|    gen/train/explained_variance    | 0.948       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0232      |
|    gen/train/n_updates             | 305         |
|    gen/train/policy_gradient_loss  | 0.000134    |
|    gen/train/value_loss            | 0.117  

round:  51%|█████     | 62/122 [04:20<04:11,  4.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -164        |
|    gen/time/fps                    | 7888        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1032192     |
|    gen/train/approx_kl             | 0.001383436 |
|    gen/train/clip_fraction         | 0.0624      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.201      |
|    gen/train/explained_variance    | 0.914       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0295      |
|    gen/train/n_updates             | 310         |
|    gen/train/policy_gradient_loss  | 2.79e-05    |
|    gen/train/value_loss            | 0.141  

round:  52%|█████▏    | 63/122 [04:24<04:06,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -167         |
|    gen/time/fps                    | 7892         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1048576      |
|    gen/train/approx_kl             | 0.0015933271 |
|    gen/train/clip_fraction         | 0.0807       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.194       |
|    gen/train/explained_variance    | 0.928        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00293      |
|    gen/train/n_updates             | 315          |
|    gen/train/policy_gradient_loss  | -0.000733    |
|    gen/train/value_loss   

round:  52%|█████▏    | 64/122 [04:28<04:03,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -158        |
|    gen/time/fps                    | 7920        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1064960     |
|    gen/train/approx_kl             | 0.003003061 |
|    gen/train/clip_fraction         | 0.114       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.197      |
|    gen/train/explained_variance    | 0.967       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0111      |
|    gen/train/n_updates             | 320         |
|    gen/train/policy_gradient_loss  | -0.00168    |
|    gen/train/value_loss            | 0.0658 

round:  53%|█████▎    | 65/122 [04:33<03:57,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -136         |
|    gen/time/fps                    | 7885         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1081344      |
|    gen/train/approx_kl             | 0.0016688723 |
|    gen/train/clip_fraction         | 0.0777       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.189       |
|    gen/train/explained_variance    | 0.94         |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0258       |
|    gen/train/n_updates             | 325          |
|    gen/train/policy_gradient_loss  | 0.000431     |
|    gen/train/value_loss   

round:  54%|█████▍    | 66/122 [04:37<03:52,  4.15s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -121         |
|    gen/time/fps                    | 7885         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1097728      |
|    gen/train/approx_kl             | 0.0010118785 |
|    gen/train/clip_fraction         | 0.0516       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.189       |
|    gen/train/explained_variance    | 0.923        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00388      |
|    gen/train/n_updates             | 330          |
|    gen/train/policy_gradient_loss  | 0.000984     |
|    gen/train/value_loss   

round:  55%|█████▍    | 67/122 [04:41<03:49,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -115         |
|    gen/time/fps                    | 7911         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1114112      |
|    gen/train/approx_kl             | 0.0012237182 |
|    gen/train/clip_fraction         | 0.0571       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.198       |
|    gen/train/explained_variance    | 0.934        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0175       |
|    gen/train/n_updates             | 335          |
|    gen/train/policy_gradient_loss  | 0.000813     |
|    gen/train/value_loss   

round:  56%|█████▌    | 68/122 [04:45<03:44,  4.17s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 500        |
|    gen/rollout/ep_rew_wrapped_mean | -124       |
|    gen/time/fps                    | 7919       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 1130496    |
|    gen/train/approx_kl             | 0.00125026 |
|    gen/train/clip_fraction         | 0.0512     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.188     |
|    gen/train/explained_variance    | 0.97       |
|    gen/train/learning_rate         | 0.0005     |
|    gen/train/loss                  | 0.00254    |
|    gen/train/n_updates             | 340        |
|    gen/train/policy_gradient_loss  | 0.0012     |
|    gen/train/value_loss            | 0.0822     |
------------

round:  57%|█████▋    | 69/122 [04:49<03:40,  4.15s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -134         |
|    gen/time/fps                    | 7916         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1146880      |
|    gen/train/approx_kl             | 0.0012419142 |
|    gen/train/clip_fraction         | 0.0528       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.193       |
|    gen/train/explained_variance    | 0.984        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00673      |
|    gen/train/n_updates             | 345          |
|    gen/train/policy_gradient_loss  | 0.00102      |
|    gen/train/value_loss   

round:  57%|█████▋    | 70/122 [04:53<03:37,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -136         |
|    gen/time/fps                    | 7919         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1163264      |
|    gen/train/approx_kl             | 0.0011923702 |
|    gen/train/clip_fraction         | 0.0609       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.184       |
|    gen/train/explained_variance    | 0.988        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0113       |
|    gen/train/n_updates             | 350          |
|    gen/train/policy_gradient_loss  | 0.00136      |
|    gen/train/value_loss   

round:  58%|█████▊    | 71/122 [04:58<03:32,  4.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -137         |
|    gen/time/fps                    | 7907         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1179648      |
|    gen/train/approx_kl             | 0.0014900529 |
|    gen/train/clip_fraction         | 0.0579       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.189       |
|    gen/train/explained_variance    | 0.985        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00762      |
|    gen/train/n_updates             | 355          |
|    gen/train/policy_gradient_loss  | 0.00117      |
|    gen/train/value_loss   

round:  59%|█████▉    | 72/122 [05:02<03:28,  4.16s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -144         |
|    gen/time/fps                    | 7844         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1196032      |
|    gen/train/approx_kl             | 0.0017126453 |
|    gen/train/clip_fraction         | 0.0612       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.177       |
|    gen/train/explained_variance    | 0.989        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0113       |
|    gen/train/n_updates             | 360          |
|    gen/train/policy_gradient_loss  | 0.000668     |
|    gen/train/value_loss   

round:  60%|█████▉    | 73/122 [05:06<03:26,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -156         |
|    gen/time/fps                    | 7706         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1212416      |
|    gen/train/approx_kl             | 0.0022147195 |
|    gen/train/clip_fraction         | 0.0623       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.175       |
|    gen/train/explained_variance    | 0.985        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0229       |
|    gen/train/n_updates             | 365          |
|    gen/train/policy_gradient_loss  | 0.000928     |
|    gen/train/value_loss   

round:  61%|██████    | 74/122 [05:10<03:22,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -168         |
|    gen/time/fps                    | 7787         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1228800      |
|    gen/train/approx_kl             | 0.0014107237 |
|    gen/train/clip_fraction         | 0.0557       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.172       |
|    gen/train/explained_variance    | 0.972        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0121       |
|    gen/train/n_updates             | 370          |
|    gen/train/policy_gradient_loss  | 0.000909     |
|    gen/train/value_loss   

round:  61%|██████▏   | 75/122 [05:15<03:18,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -148         |
|    gen/time/fps                    | 7774         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1245184      |
|    gen/train/approx_kl             | 0.0012217183 |
|    gen/train/clip_fraction         | 0.0575       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.176       |
|    gen/train/explained_variance    | 0.993        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | -0.00583     |
|    gen/train/n_updates             | 375          |
|    gen/train/policy_gradient_loss  | 0.000795     |
|    gen/train/value_loss   

round:  62%|██████▏   | 76/122 [05:19<03:16,  4.27s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -118         |
|    gen/time/fps                    | 7794         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1261568      |
|    gen/train/approx_kl             | 0.0035412968 |
|    gen/train/clip_fraction         | 0.0807       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.174       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | -0.00298     |
|    gen/train/n_updates             | 380          |
|    gen/train/policy_gradient_loss  | 0.00112      |
|    gen/train/value_loss   

round:  63%|██████▎   | 77/122 [05:23<03:10,  4.24s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -95.2        |
|    gen/time/fps                    | 7802         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1277952      |
|    gen/train/approx_kl             | 0.0013529107 |
|    gen/train/clip_fraction         | 0.0562       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.179       |
|    gen/train/explained_variance    | 0.989        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00289      |
|    gen/train/n_updates             | 385          |
|    gen/train/policy_gradient_loss  | 0.00127      |
|    gen/train/value_loss   

round:  64%|██████▍   | 78/122 [05:27<03:05,  4.22s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -92.2       |
|    gen/time/fps                    | 7387        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1294336     |
|    gen/train/approx_kl             | 0.001522611 |
|    gen/train/clip_fraction         | 0.0572      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.177      |
|    gen/train/explained_variance    | 0.989       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.0178      |
|    gen/train/n_updates             | 390         |
|    gen/train/policy_gradient_loss  | 0.00119     |
|    gen/train/value_loss            | 0.0515 

round:  65%|██████▍   | 79/122 [05:32<03:02,  4.25s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -103         |
|    gen/time/fps                    | 7913         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1310720      |
|    gen/train/approx_kl             | 0.0014209982 |
|    gen/train/clip_fraction         | 0.0578       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.169       |
|    gen/train/explained_variance    | 0.993        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 3.28e-05     |
|    gen/train/n_updates             | 395          |
|    gen/train/policy_gradient_loss  | 0.00088      |
|    gen/train/value_loss   

round:  66%|██████▌   | 80/122 [05:36<02:57,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -116         |
|    gen/time/fps                    | 7851         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1327104      |
|    gen/train/approx_kl             | 0.0016369822 |
|    gen/train/clip_fraction         | 0.0597       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.167       |
|    gen/train/explained_variance    | 0.993        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.021        |
|    gen/train/n_updates             | 400          |
|    gen/train/policy_gradient_loss  | 0.00136      |
|    gen/train/value_loss   

round:  66%|██████▋   | 81/122 [05:40<02:53,  4.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -132         |
|    gen/time/fps                    | 7930         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1343488      |
|    gen/train/approx_kl             | 0.0015123081 |
|    gen/train/clip_fraction         | 0.0602       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.166       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0186       |
|    gen/train/n_updates             | 405          |
|    gen/train/policy_gradient_loss  | 0.000733     |
|    gen/train/value_loss   

round:  67%|██████▋   | 82/122 [05:44<02:48,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -149         |
|    gen/time/fps                    | 7924         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1359872      |
|    gen/train/approx_kl             | 0.0013680488 |
|    gen/train/clip_fraction         | 0.0628       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.165       |
|    gen/train/explained_variance    | 0.992        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.011        |
|    gen/train/n_updates             | 410          |
|    gen/train/policy_gradient_loss  | 0.00106      |
|    gen/train/value_loss   

round:  68%|██████▊   | 83/122 [05:48<02:43,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -156         |
|    gen/time/fps                    | 7875         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1376256      |
|    gen/train/approx_kl             | 0.0019548116 |
|    gen/train/clip_fraction         | 0.0589       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.166       |
|    gen/train/explained_variance    | 0.98         |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00397      |
|    gen/train/n_updates             | 415          |
|    gen/train/policy_gradient_loss  | 0.00115      |
|    gen/train/value_loss   

round:  69%|██████▉   | 84/122 [05:53<02:40,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -153         |
|    gen/time/fps                    | 7904         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1392640      |
|    gen/train/approx_kl             | 0.0015132802 |
|    gen/train/clip_fraction         | 0.0591       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.168       |
|    gen/train/explained_variance    | 0.972        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00514      |
|    gen/train/n_updates             | 420          |
|    gen/train/policy_gradient_loss  | 0.00122      |
|    gen/train/value_loss   

round:  70%|██████▉   | 85/122 [05:57<02:35,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -151         |
|    gen/time/fps                    | 7902         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1409024      |
|    gen/train/approx_kl             | 0.0021492743 |
|    gen/train/clip_fraction         | 0.0603       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.159       |
|    gen/train/explained_variance    | 0.979        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00775      |
|    gen/train/n_updates             | 425          |
|    gen/train/policy_gradient_loss  | 0.000661     |
|    gen/train/value_loss   

round:  70%|███████   | 86/122 [06:01<02:30,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -132         |
|    gen/time/fps                    | 7521         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1425408      |
|    gen/train/approx_kl             | 0.0026053751 |
|    gen/train/clip_fraction         | 0.0753       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.166       |
|    gen/train/explained_variance    | 0.993        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00171      |
|    gen/train/n_updates             | 430          |
|    gen/train/policy_gradient_loss  | 0.000334     |
|    gen/train/value_loss   

round:  71%|███████▏  | 87/122 [06:05<02:26,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -107        |
|    gen/time/fps                    | 7884        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1441792     |
|    gen/train/approx_kl             | 0.001344504 |
|    gen/train/clip_fraction         | 0.0529      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.167      |
|    gen/train/explained_variance    | 0.985       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.00477     |
|    gen/train/n_updates             | 435         |
|    gen/train/policy_gradient_loss  | 0.00094     |
|    gen/train/value_loss            | 0.0462 

round:  72%|███████▏  | 88/122 [06:09<02:22,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -101         |
|    gen/time/fps                    | 7886         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1458176      |
|    gen/train/approx_kl             | 0.0010182566 |
|    gen/train/clip_fraction         | 0.048        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.165       |
|    gen/train/explained_variance    | 0.986        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00314      |
|    gen/train/n_updates             | 440          |
|    gen/train/policy_gradient_loss  | 0.000999     |
|    gen/train/value_loss   

round:  73%|███████▎  | 89/122 [06:13<02:17,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -111         |
|    gen/time/fps                    | 7865         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1474560      |
|    gen/train/approx_kl             | 0.0016567563 |
|    gen/train/clip_fraction         | 0.061        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.162       |
|    gen/train/explained_variance    | 0.992        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0125       |
|    gen/train/n_updates             | 445          |
|    gen/train/policy_gradient_loss  | 0.00149      |
|    gen/train/value_loss   

round:  74%|███████▍  | 90/122 [06:18<02:14,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -125         |
|    gen/time/fps                    | 7897         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1490944      |
|    gen/train/approx_kl             | 0.0014661455 |
|    gen/train/clip_fraction         | 0.054        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.16        |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00498      |
|    gen/train/n_updates             | 450          |
|    gen/train/policy_gradient_loss  | 0.00086      |
|    gen/train/value_loss   

round:  75%|███████▍  | 91/122 [06:22<02:10,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -129         |
|    gen/time/fps                    | 7897         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1507328      |
|    gen/train/approx_kl             | 0.0017633503 |
|    gen/train/clip_fraction         | 0.0497       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.157       |
|    gen/train/explained_variance    | 0.991        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0125       |
|    gen/train/n_updates             | 455          |
|    gen/train/policy_gradient_loss  | 0.00108      |
|    gen/train/value_loss   

round:  75%|███████▌  | 92/122 [06:26<02:05,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -137         |
|    gen/time/fps                    | 7902         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1523712      |
|    gen/train/approx_kl             | 0.0027699296 |
|    gen/train/clip_fraction         | 0.0668       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.155       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0097       |
|    gen/train/n_updates             | 460          |
|    gen/train/policy_gradient_loss  | 0.000873     |
|    gen/train/value_loss   

round:  76%|███████▌  | 93/122 [06:30<02:02,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -143         |
|    gen/time/fps                    | 7874         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1540096      |
|    gen/train/approx_kl             | 0.0017491133 |
|    gen/train/clip_fraction         | 0.056        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.147       |
|    gen/train/explained_variance    | 0.989        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00922      |
|    gen/train/n_updates             | 465          |
|    gen/train/policy_gradient_loss  | 0.00064      |
|    gen/train/value_loss   

round:  77%|███████▋  | 94/122 [06:34<01:57,  4.20s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 500        |
|    gen/rollout/ep_rew_wrapped_mean | -166       |
|    gen/time/fps                    | 7853       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 1556480    |
|    gen/train/approx_kl             | 0.00157512 |
|    gen/train/clip_fraction         | 0.0475     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.142     |
|    gen/train/explained_variance    | 0.975      |
|    gen/train/learning_rate         | 0.0005     |
|    gen/train/loss                  | 0.0103     |
|    gen/train/n_updates             | 470        |
|    gen/train/policy_gradient_loss  | 0.00147    |
|    gen/train/value_loss            | 0.278      |
------------

round:  78%|███████▊  | 95/122 [06:39<01:53,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -176         |
|    gen/time/fps                    | 7423         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1572864      |
|    gen/train/approx_kl             | 0.0015949288 |
|    gen/train/clip_fraction         | 0.0575       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.141       |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0112       |
|    gen/train/n_updates             | 475          |
|    gen/train/policy_gradient_loss  | 0.00123      |
|    gen/train/value_loss   

round:  79%|███████▊  | 96/122 [06:43<01:49,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -160         |
|    gen/time/fps                    | 7908         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1589248      |
|    gen/train/approx_kl             | 0.0015636419 |
|    gen/train/clip_fraction         | 0.0562       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.148       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.000753     |
|    gen/train/n_updates             | 480          |
|    gen/train/policy_gradient_loss  | 0.000613     |
|    gen/train/value_loss   

round:  80%|███████▉  | 97/122 [06:47<01:44,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -128         |
|    gen/time/fps                    | 7884         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1605632      |
|    gen/train/approx_kl             | 0.0011989621 |
|    gen/train/clip_fraction         | 0.0513       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.149       |
|    gen/train/explained_variance    | 0.987        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00535      |
|    gen/train/n_updates             | 485          |
|    gen/train/policy_gradient_loss  | 0.00103      |
|    gen/train/value_loss   

round:  80%|████████  | 98/122 [06:51<01:41,  4.21s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 500        |
|    gen/rollout/ep_rew_wrapped_mean | -99.8      |
|    gen/time/fps                    | 7882       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 1622016    |
|    gen/train/approx_kl             | 0.00131696 |
|    gen/train/clip_fraction         | 0.0518     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.148     |
|    gen/train/explained_variance    | 0.979      |
|    gen/train/learning_rate         | 0.0005     |
|    gen/train/loss                  | 0.013      |
|    gen/train/n_updates             | 490        |
|    gen/train/policy_gradient_loss  | 0.00161    |
|    gen/train/value_loss            | 0.0356     |
------------

round:  81%|████████  | 99/122 [06:56<01:36,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -106         |
|    gen/time/fps                    | 7933         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1638400      |
|    gen/train/approx_kl             | 0.0012314331 |
|    gen/train/clip_fraction         | 0.0492       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.147       |
|    gen/train/explained_variance    | 0.983        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00214      |
|    gen/train/n_updates             | 495          |
|    gen/train/policy_gradient_loss  | 0.000931     |
|    gen/train/value_loss   

round:  82%|████████▏ | 100/122 [07:00<01:31,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -114         |
|    gen/time/fps                    | 7838         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1654784      |
|    gen/train/approx_kl             | 0.0017235074 |
|    gen/train/clip_fraction         | 0.0554       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.151       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0143       |
|    gen/train/n_updates             | 500          |
|    gen/train/policy_gradient_loss  | 0.000948     |
|    gen/train/value_loss   

round:  83%|████████▎ | 101/122 [07:04<01:28,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -120         |
|    gen/time/fps                    | 7915         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1671168      |
|    gen/train/approx_kl             | 0.0016880758 |
|    gen/train/clip_fraction         | 0.0524       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.142       |
|    gen/train/explained_variance    | 0.997        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0235       |
|    gen/train/n_updates             | 505          |
|    gen/train/policy_gradient_loss  | 0.00122      |
|    gen/train/value_loss   

round:  84%|████████▎ | 102/122 [07:08<01:23,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -124         |
|    gen/time/fps                    | 7914         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1687552      |
|    gen/train/approx_kl             | 0.0028758654 |
|    gen/train/clip_fraction         | 0.0578       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.135       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00603      |
|    gen/train/n_updates             | 510          |
|    gen/train/policy_gradient_loss  | 0.000807     |
|    gen/train/value_loss   

round:  84%|████████▍ | 103/122 [07:12<01:19,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -146         |
|    gen/time/fps                    | 7944         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1703936      |
|    gen/train/approx_kl             | 0.0021047588 |
|    gen/train/clip_fraction         | 0.0564       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.14        |
|    gen/train/explained_variance    | 0.979        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00266      |
|    gen/train/n_updates             | 515          |
|    gen/train/policy_gradient_loss  | 0.00137      |
|    gen/train/value_loss   

round:  85%|████████▌ | 104/122 [07:16<01:15,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -159         |
|    gen/time/fps                    | 7894         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1720320      |
|    gen/train/approx_kl             | 0.0023008136 |
|    gen/train/clip_fraction         | 0.0602       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.144       |
|    gen/train/explained_variance    | 0.933        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0211       |
|    gen/train/n_updates             | 520          |
|    gen/train/policy_gradient_loss  | 0.00132      |
|    gen/train/value_loss   

round:  86%|████████▌ | 105/122 [07:21<01:11,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -171         |
|    gen/time/fps                    | 7866         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1736704      |
|    gen/train/approx_kl             | 0.0016984557 |
|    gen/train/clip_fraction         | 0.0499       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.142       |
|    gen/train/explained_variance    | 0.983        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0125       |
|    gen/train/n_updates             | 525          |
|    gen/train/policy_gradient_loss  | 0.000988     |
|    gen/train/value_loss   

round:  87%|████████▋ | 106/122 [07:25<01:06,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -158         |
|    gen/time/fps                    | 7839         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1753088      |
|    gen/train/approx_kl             | 0.0032437537 |
|    gen/train/clip_fraction         | 0.0964       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.146       |
|    gen/train/explained_variance    | 0.997        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.000816     |
|    gen/train/n_updates             | 530          |
|    gen/train/policy_gradient_loss  | -0.00135     |
|    gen/train/value_loss   

round:  88%|████████▊ | 107/122 [07:29<01:03,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -144         |
|    gen/time/fps                    | 7897         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1769472      |
|    gen/train/approx_kl             | 0.0014853106 |
|    gen/train/clip_fraction         | 0.0529       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.138       |
|    gen/train/explained_variance    | 0.988        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00398      |
|    gen/train/n_updates             | 535          |
|    gen/train/policy_gradient_loss  | 0.00124      |
|    gen/train/value_loss   

round:  89%|████████▊ | 108/122 [07:33<00:58,  4.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -111         |
|    gen/time/fps                    | 7905         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1785856      |
|    gen/train/approx_kl             | 0.0014953479 |
|    gen/train/clip_fraction         | 0.0497       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.137       |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00228      |
|    gen/train/n_updates             | 540          |
|    gen/train/policy_gradient_loss  | 0.000911     |
|    gen/train/value_loss   

round:  89%|████████▉ | 109/122 [07:37<00:54,  4.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -89.8        |
|    gen/time/fps                    | 7888         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1802240      |
|    gen/train/approx_kl             | 0.0016333138 |
|    gen/train/clip_fraction         | 0.0516       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.145       |
|    gen/train/explained_variance    | 0.979        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00425      |
|    gen/train/n_updates             | 545          |
|    gen/train/policy_gradient_loss  | 0.00103      |
|    gen/train/value_loss   

round:  90%|█████████ | 110/122 [07:42<00:50,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -90.5        |
|    gen/time/fps                    | 7884         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1818624      |
|    gen/train/approx_kl             | 0.0011128657 |
|    gen/train/clip_fraction         | 0.0415       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.134       |
|    gen/train/explained_variance    | 0.989        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0102       |
|    gen/train/n_updates             | 550          |
|    gen/train/policy_gradient_loss  | 0.000642     |
|    gen/train/value_loss   

round:  91%|█████████ | 111/122 [07:46<00:46,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -101         |
|    gen/time/fps                    | 7874         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1835008      |
|    gen/train/approx_kl             | 0.0020772272 |
|    gen/train/clip_fraction         | 0.05         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.141       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | -0.00586     |
|    gen/train/n_updates             | 555          |
|    gen/train/policy_gradient_loss  | 0.000822     |
|    gen/train/value_loss   

round:  92%|█████████▏| 112/122 [07:50<00:41,  4.19s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 500           |
|    gen/rollout/ep_rew_mean         | 500           |
|    gen/rollout/ep_rew_wrapped_mean | -112          |
|    gen/time/fps                    | 7909          |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 2             |
|    gen/time/total_timesteps        | 1851392       |
|    gen/train/approx_kl             | 0.00083784066 |
|    gen/train/clip_fraction         | 0.0407        |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.144        |
|    gen/train/explained_variance    | 0.995         |
|    gen/train/learning_rate         | 0.0005        |
|    gen/train/loss                  | 0.000253      |
|    gen/train/n_updates             | 560           |
|    gen/train/policy_gradient_loss  | 0.000962      |
|    gen/t

round:  93%|█████████▎| 113/122 [07:54<00:37,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -139         |
|    gen/time/fps                    | 7900         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1867776      |
|    gen/train/approx_kl             | 0.0011376989 |
|    gen/train/clip_fraction         | 0.0432       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.142       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.0231       |
|    gen/train/n_updates             | 565          |
|    gen/train/policy_gradient_loss  | 0.00113      |
|    gen/train/value_loss   

round:  93%|█████████▎| 114/122 [07:58<00:33,  4.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -185        |
|    gen/time/fps                    | 7840        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1884160     |
|    gen/train/approx_kl             | 0.002443911 |
|    gen/train/clip_fraction         | 0.0575      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.139      |
|    gen/train/explained_variance    | 0.996       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.00525     |
|    gen/train/n_updates             | 570         |
|    gen/train/policy_gradient_loss  | 0.000496    |
|    gen/train/value_loss            | 0.222  

round:  94%|█████████▍| 115/122 [08:03<00:29,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -215         |
|    gen/time/fps                    | 7465         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1900544      |
|    gen/train/approx_kl             | 0.0019012736 |
|    gen/train/clip_fraction         | 0.0562       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.137       |
|    gen/train/explained_variance    | 0.995        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00958      |
|    gen/train/n_updates             | 575          |
|    gen/train/policy_gradient_loss  | 0.0013       |
|    gen/train/value_loss   

round:  95%|█████████▌| 116/122 [08:07<00:25,  4.22s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -205         |
|    gen/time/fps                    | 7847         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1916928      |
|    gen/train/approx_kl             | 0.0012646666 |
|    gen/train/clip_fraction         | 0.0489       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.139       |
|    gen/train/explained_variance    | 0.998        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00321      |
|    gen/train/n_updates             | 580          |
|    gen/train/policy_gradient_loss  | 0.00138      |
|    gen/train/value_loss   

round:  96%|█████████▌| 117/122 [08:11<00:21,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -150         |
|    gen/time/fps                    | 7896         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1933312      |
|    gen/train/approx_kl             | 0.0019948862 |
|    gen/train/clip_fraction         | 0.056        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.144       |
|    gen/train/explained_variance    | 0.999        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00743      |
|    gen/train/n_updates             | 585          |
|    gen/train/policy_gradient_loss  | 0.000768     |
|    gen/train/value_loss   

round:  97%|█████████▋| 118/122 [08:15<00:16,  4.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | -129        |
|    gen/time/fps                    | 7482        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 1949696     |
|    gen/train/approx_kl             | 0.001033364 |
|    gen/train/clip_fraction         | 0.0421      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.139      |
|    gen/train/explained_variance    | 0.995       |
|    gen/train/learning_rate         | 0.0005      |
|    gen/train/loss                  | 0.00117     |
|    gen/train/n_updates             | 590         |
|    gen/train/policy_gradient_loss  | 0.00116     |
|    gen/train/value_loss            | 0.0313 

round:  98%|█████████▊| 119/122 [08:20<00:12,  4.22s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 500           |
|    gen/rollout/ep_rew_mean         | 500           |
|    gen/rollout/ep_rew_wrapped_mean | -114          |
|    gen/time/fps                    | 7832          |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 2             |
|    gen/time/total_timesteps        | 1966080       |
|    gen/train/approx_kl             | 0.00082639157 |
|    gen/train/clip_fraction         | 0.0406        |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.136        |
|    gen/train/explained_variance    | 0.996         |
|    gen/train/learning_rate         | 0.0005        |
|    gen/train/loss                  | -0.00732      |
|    gen/train/n_updates             | 595           |
|    gen/train/policy_gradient_loss  | 0.000985      |
|    gen/t

round:  98%|█████████▊| 120/122 [08:24<00:08,  4.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -101         |
|    gen/time/fps                    | 7900         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1982464      |
|    gen/train/approx_kl             | 0.0016559655 |
|    gen/train/clip_fraction         | 0.0476       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.138       |
|    gen/train/explained_variance    | 0.997        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.00312      |
|    gen/train/n_updates             | 600          |
|    gen/train/policy_gradient_loss  | 0.00107      |
|    gen/train/value_loss   

round:  99%|█████████▉| 121/122 [08:28<00:04,  4.20s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | -117         |
|    gen/time/fps                    | 7497         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 1998848      |
|    gen/train/approx_kl             | 0.0011095797 |
|    gen/train/clip_fraction         | 0.0367       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.151       |
|    gen/train/explained_variance    | 0.994        |
|    gen/train/learning_rate         | 0.0005       |
|    gen/train/loss                  | 0.011        |
|    gen/train/n_updates             | 605          |
|    gen/train/policy_gradient_loss  | 0.000833     |
|    gen/train/value_loss   

round: 100%|██████████| 122/122 [08:32<00:00,  4.20s/it]


Visualization of trained Learner Policy (AIRL Generator):

In [8]:
import gymnasium as gym
import os
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

# Roll out the trained learner (AIRL generator) and render it.
# Use an on-screen window when a display server is available; otherwise render
# off-screen frames and show them inline in the notebook.
render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)

try:
    # Gymnasium returns (obs, info) from reset(); unpack so `obs` is the raw observation
    obs, info = env.reset()

    # The loop counter needs its own name: `predict` returns (action, state), and
    # unpacking that state into `_` would overwrite a `_` counter, making the
    # `% 10` check below fail with a TypeError in rgb_array mode.
    for step in range(1000):
        # Pass only the observation (not the (obs, info) tuple) to the SB3 policy
        action, _state = learner.predict(obs, deterministic=True)

        # Gymnasium step returns (obs, reward, terminated, truncated, info)
        obs, rew, terminated, truncated, info = env.step(action)

        # Render depending on mode. For `rgb_array` we display frames inside the notebook.
        if render_mode == "rgb_array":
            frame = env.render()
            # display every 10th frame to avoid excessive output
            if step % 10 == 0:
                clear_output(wait=True)
                plt.imshow(frame)
                plt.axis('off')
                display(plt.gcf())
        else:
            env.render()   # opens a window (when DISPLAY is available)

        # episode ends when either terminated or truncated
        if terminated or truncated:
            obs, info = env.reset()
            # if running in rgb_array mode, stop after one episode
            if render_mode == "rgb_array":
                break
finally:
    # Always release the environment, even if the cell is interrupted mid-rollout.
    env.close()

KeyboardInterrupt: 

We can see that an untrained policy performs poorly, while AIRL brings an improvement.

In [None]:
# Summarise the learner's episode returns as "mean +/- std", first for the
# untrained policy and then for the policy obtained after AIRL training.
before_mean = np.mean(learner_rewards_before_training)
before_std = np.std(learner_rewards_before_training)
print("Rewards before training:", before_mean, "+/-", before_std)

after_mean = np.mean(learner_rewards_after_training)
after_std = np.std(learner_rewards_after_training)
print("Rewards after training:", after_mean, "+/-", after_std)

Rewards before training: 102.6 +/- 24.11514047232568
Rewards after training: 500.0 +/- 0.0


Preparing to train an RL agent with the learned AIRL reward net:

In [10]:
# Use learned reward_net as reward function (SAC if continuous, DQN if discrete).
# The wrapper replaces the environment reward with reward_net.predict_processed(...).
import numpy as np
import gymnasium as gym
from stable_baselines3 import SAC, DQN
from stable_baselines3.sac import MlpPolicy as SACPolicy
from stable_baselines3.dqn import MlpPolicy as DQNPolicy
from stable_baselines3.common.evaluation import evaluate_policy

# Resolve the reward network from the notebook namespace.
# Priority: `airl_trainer.reward_test` (when the trainer exists and the attribute
# is not None), then a bare `reward_net` variable; otherwise abort.
base_reward_net = None
_notebook_ns = globals()
if getattr(_notebook_ns.get('airl_trainer'), 'reward_test', None) is not None:
    base_reward_net = _notebook_ns['airl_trainer'].reward_test
    print('Using airl_trainer.reward_test as reward_net.')
elif 'reward_net' in _notebook_ns:
    base_reward_net = _notebook_ns['reward_net']
    print('Using reward_net as reward_net.')
else:
    # The message is German for: "No reward_net found. Run the AIRL cells first."
    raise RuntimeError('Kein reward_net gefunden. Führe zuerst die AIRL-Zellen aus.')

class RewardNetRewardWrapper(gym.Wrapper):
    """Gym wrapper that substitutes a learned reward for the environment reward.

    On every step the wrapped environment's own reward is discarded and replaced
    by the reward network's prediction for the transition
    (previous observation, action, next observation, done).

    Observations are converted with ``np.asarray`` and are expected to be plain
    arrays (not ``Dict`` observations).
    """

    def __init__(self, env, reward_net):
        """Wrap ``env`` and remember ``reward_net`` for reward prediction.

        Args:
            env: Environment to wrap.
            reward_net: Object exposing ``predict_processed`` or ``predict``,
                called as ``fn(obs, action, next_obs, done)`` on batches of size one.
        """
        super().__init__(env)
        self.reward_net = reward_net
        # Observation preceding the next action; populated by reset()/step().
        self.last_obs = None

    def reset(self, **kwargs):
        """Reset the wrapped env, cache the first observation, and pass the result through."""
        result = self.env.reset(**kwargs)
        # Accept both an `(obs, info)` tuple and a bare observation.
        if not isinstance(result, tuple):
            first_obs, _info = result, {}
        else:
            first_obs, _info = result
        self.last_obs = np.asarray(first_obs)
        return result

    def step(self, action):
        """Step the wrapped env and return its transition with the learned reward.

        Returns:
            ``(next_obs, reward, terminated, truncated, info)`` where ``reward``
            is a Python float predicted by the reward network.
        """
        next_obs, _env_reward, terminated, truncated, info = self.env.step(action)

        # Add a leading batch axis of size one to each transition component.
        state, act, next_state = (
            np.expand_dims(np.asarray(item), 0)
            for item in (self.last_obs, action, next_obs)
        )
        # The done flag is set when the episode ended for either reason.
        done_flag = np.array([bool(terminated or truncated)], dtype=float)

        # Prefer predict_processed when the reward net provides it.
        predict_fn = (
            self.reward_net.predict_processed
            if hasattr(self.reward_net, 'predict_processed')
            else self.reward_net.predict
        )
        learned_reward = float(predict_fn(state, act, next_state, done_flag)[0])

        self.last_obs = np.asarray(next_obs)
        return next_obs, learned_reward, terminated, truncated, info

# Build the environment and swap its reward for the learned reward net.
base_env = gym.make('seals:seals/CartPole-v0')
wrapped_env = RewardNetRewardWrapper(base_env, base_reward_net)

# Pick the algorithm from the action space: a Box space selects SAC, anything
# else falls through to DQN. `gym` is gymnasium here (imported above), not the
# legacy `gym` package.
if not isinstance(wrapped_env.action_space, gym.spaces.Box):
    Algo, Policy = DQN, DQNPolicy
    print('Using DQN for discrete action space.')
else:
    Algo, Policy = SAC, SACPolicy
    print('Using SAC for continuous action space.')

agent = Algo(Policy, wrapped_env, verbose=1, seed=SEED)

Using airl_trainer.reward_test as reward_net.
Using DQN for discrete action space.
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Visualization of RL Policy before training:

In [11]:
# Roll out the (still untrained) RL agent and render it.
# Use an on-screen window when a display server is available; otherwise render
# off-screen frames and show them inline in the notebook.
render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)

try:
    # Gymnasium returns (obs, info) from reset(); unpack so `obs` is the raw observation
    obs, info = env.reset()

    # The loop counter needs its own name: `predict` returns (action, state), and
    # unpacking that state into `_` would overwrite a `_` counter, making the
    # `% 10` check below fail with a TypeError in rgb_array mode.
    for step in range(100):
        # Pass only the observation (not the (obs, info) tuple) to the SB3 policy
        action, _state = agent.predict(obs, deterministic=True)

        # Gymnasium step returns (obs, reward, terminated, truncated, info)
        obs, rew, terminated, truncated, info = env.step(action)

        # Render depending on mode. For `rgb_array` we display frames inside the notebook.
        if render_mode == "rgb_array":
            frame = env.render()
            # display every 10th frame to avoid excessive output
            if step % 10 == 0:
                clear_output(wait=True)
                plt.imshow(frame)
                plt.axis('off')
                display(plt.gcf())
        else:
            env.render()   # opens a window (when DISPLAY is available)

        # episode ends when either terminated or truncated
        if terminated or truncated:
            obs, info = env.reset()
            # if running in rgb_array mode, stop after one episode
            if render_mode == "rgb_array":
                break
finally:
    # Always release the environment, even if the cell is interrupted mid-rollout.
    env.close()

KeyboardInterrupt: 

Training RL Agent with AIRL Reward Net:

In [12]:
# Train the agent on the wrapped env, whose rewards come from the learned reward net.
total_timesteps = 5_000_000
agent.learn(total_timesteps=total_timesteps)

# Evaluation: with return_episode_rewards=True the call returns
# (episode_rewards, episode_lengths). Returns are measured on `wrapped_env`,
# i.e. under the learned reward rather than the environment's own reward.
episode_rewards, episode_lengths = evaluate_policy(
    agent,
    wrapped_env,
    n_eval_episodes=10,
    return_episode_rewards=True,
)
mean_return = np.mean(episode_rewards)
print('Mean return (10 eps):', mean_return)

# Persist the trained agent to disk.
agent.save('agent_with_learned_reward')

-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 500       |
|    ep_rew_mean      | -9.86e+03 |
|    exploration_rate | 0.996     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 3478      |
|    time_elapsed     | 0         |
|    total_timesteps  | 2000      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 500       |
|    ep_rew_mean      | -1.32e+04 |
|    exploration_rate | 0.992     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 3485      |
|    time_elapsed     | 1         |
|    total_timesteps  | 4000      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 500       |
|    ep_rew_mean      | -1.74e+04 |
|    exploration_rate | 0.989     |
| time/               |     

KeyboardInterrupt: 

Visualization of the RL policy trained with the AIRL reward net:

In [None]:
# Roll out the RL agent trained on the learned AIRL reward and render it.
# Use an on-screen window when a display server is available; otherwise render
# off-screen frames and show them inline in the notebook.
render_mode = "human" if os.environ.get('DISPLAY') else "rgb_array"
env = gym.make("seals:seals/CartPole-v0", render_mode=render_mode)

try:
    # Gymnasium returns (obs, info) from reset(); unpack so `obs` is the raw observation
    obs, info = env.reset()

    # The loop counter needs its own name: `predict` returns (action, state), and
    # unpacking that state into `_` would overwrite a `_` counter, making the
    # `% 10` check below fail with a TypeError in rgb_array mode.
    for step in range(1000):
        # Pass only the observation (not the (obs, info) tuple) to the SB3 policy
        action, _state = agent.predict(obs, deterministic=True)

        # Gymnasium step returns (obs, reward, terminated, truncated, info)
        obs, rew, terminated, truncated, info = env.step(action)

        # Render depending on mode. For `rgb_array` we display frames inside the notebook.
        if render_mode == "rgb_array":
            frame = env.render()
            # display every 10th frame to avoid excessive output
            if step % 10 == 0:
                clear_output(wait=True)
                plt.imshow(frame)
                plt.axis('off')
                display(plt.gcf())
        else:
            env.render()   # opens a window (when DISPLAY is available)

        # episode ends when either terminated or truncated
        if terminated or truncated:
            obs, info = env.reset()
            # if running in rgb_array mode, stop after one episode
            if render_mode == "rgb_array":
                break
finally:
    # Always release the environment, even if the cell is interrupted mid-rollout.
    env.close()