In [1]:
import random
import torch
from imitation.algorithms import preference_comparisons
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
import gymnasium as gym
from stable_baselines3 import PPO
import numpy as np
import pandas as pd
import seals
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import tqdm
from wandb.integration.sb3 import WandbCallback
import wandb
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from utils.seed import set_seeds
from seals.util import AbsorbAfterDoneWrapper
from gymnasium.wrappers import TimeLimit
from stable_baselines3.common.vec_env import VecFrameStack

  from pandas.core import (


In [2]:
from imitation_loss.MSELoss import MSELoss
from imitation_loss.PreferenceLoss import PreferenceLoss
from imitation_loss.RatingPreferenceModel import RatingPreferenceModel

In [3]:
def make_env():
    """Build a single ``seals/CartPole-v0`` environment wrapped in ``Monitor``.

    No ``AbsorbAfterDoneWrapper`` / ``TimeLimit`` wrapping is applied here; the
    environment is used exactly as ``gym.make`` returns it.

    Returns:
        The ``Monitor``-wrapped environment (``Monitor`` records episode
        statistics such as returns).
    """
    return Monitor(gym.make("seals/CartPole-v0"))

In [4]:
# NOTE(review): `rewards` is not read or written anywhere in the visible part of
# this notebook; confirm whether a later cell relies on it before removing it.
rewards = []

def _make_ppo(env, seed, tensorboard_log=None):
    """Build a PPO learner with the hyperparameters shared by both training phases.

    Args:
        env: Vectorised environment the learner is trained on.
        seed: Seed forwarded to PPO.
        tensorboard_log: Optional TensorBoard log directory (``None`` disables it).

    Returns:
        A freshly constructed ``PPO`` instance.
    """
    return PPO(
        policy=FeedForward32Policy,
        policy_kwargs=dict(
            features_extractor_class=NormalizeFeaturesExtractor,
            features_extractor_kwargs=dict(normalize_class=RunningNorm),
        ),
        env=env,
        seed=seed,
        n_steps=2048 // env.num_envs,
        batch_size=64,
        ent_coef=0.01,
        learning_rate=2e-3,
        clip_range=0.1,
        gae_lambda=0.95,
        gamma=0.97,
        n_epochs=10,
        tensorboard_log=tensorboard_log,
    )


def _run_single_seed(num_comparisons, seed):
    """Train a reward model and a policy for one seed and evaluate the policy.

    Args:
        num_comparisons: Total number of preference comparisons requested from
            the synthetic gatherer; ``0`` (or less) skips reward-model training.
        seed: Seed used for ``set_seeds``, the NumPy generator, both PPO
            learners, the wandb run name and the saved file name.

    Returns:
        Mean return reported by ``evaluate_policy`` on the Monitor-wrapped
        environment over 100 episodes.
    """
    import os
    import traceback

    # Fail fast: the reward net is saved here at the end of the run, so make
    # sure the directory exists before spending minutes on training.
    os.makedirs("models", exist_ok=True)

    set_seeds(seed)
    rng = np.random.default_rng(seed)
    venv = DummyVecEnv([make_env])
    run = wandb.init(
        project="rlhf",
        name=f"CartPole_Preference_Improved100_{num_comparisons}_{seed}",
        sync_tensorboard=True
    )
    try:
        reward_net = BasicRewardNet(
            venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
        )
        fragmenter = preference_comparisons.RandomFragmenter(
            warning_threshold=0,
            rng=rng
        )
        gatherer = preference_comparisons.SyntheticGatherer(rng=rng, temperature=0)
        preference_model = RatingPreferenceModel(reward_net, list(range(20, 120, 20)), m_mul=100)
        reward_trainer = preference_comparisons.BasicRewardTrainer(
            preference_model=preference_model,
            loss=PreferenceLoss(),
            epochs=10,  # 10 for Preference, 30 for evaluative
            rng=rng
        )

        # Agent used only to generate trajectories while the reward model is learned.
        agent = _make_ppo(venv, seed)

        trajectory_generator = preference_comparisons.AgentTrainer(
            algorithm=agent,
            reward_fn=reward_net,
            venv=venv,
            exploration_frac=0.4,  # 0.4 for CartPole
            rng=rng,
        )

        pref_comparisons = preference_comparisons.PreferenceComparisons(
            trajectory_generator,
            reward_net,
            num_iterations=60,
            fragmenter=fragmenter,
            preference_gatherer=gatherer,
            reward_trainer=reward_trainer,
            fragment_length=200,
            transition_oversampling=1,
            initial_comparison_frac=0.1,
            allow_variable_horizon=False,
            initial_epoch_multiplier=4,
            query_schedule="hyperbolic"
        )

        if num_comparisons > 0:
            try:
                pref_comparisons.train(
                    total_timesteps=5_000,
                    total_comparisons=num_comparisons,
                )
            except Exception:
                # Best effort, as before: keep going with whatever the reward net
                # has learned so far, but report the failure instead of hiding it.
                # Catching Exception (not a bare except) lets KeyboardInterrupt
                # stop the notebook.
                # NOTE(review): the captured output of this cell shows agent
                # training stopping once the query schedule reaches its zero
                # entries, so an error here appears to be expected for small
                # budgets - TODO confirm the exact exception and catch it
                # specifically.
                print(
                    "run_experiment: preference training raised for "
                    f"num_comparisons={num_comparisons}, seed={seed}; "
                    "continuing with the current reward net."
                )
                traceback.print_exc()

        # Fresh learner trained from scratch against the learned reward only.
        learned_reward_venv = RewardVecEnvWrapper(venv, reward_net.predict_processed)
        learner = _make_ppo(
            learned_reward_venv, seed, tensorboard_log=f"runs/{run.id}"
        )
        learner.learn(500_000, callback=WandbCallback())

        # Evaluation uses the unwrapped `venv`, i.e. not the learned-reward wrapper.
        n_eval_episodes = 100
        reward_mean, _ = evaluate_policy(learner.policy, venv, n_eval_episodes)

        torch.save(
            reward_net.state_dict(),
            f"models/reward_net_{num_comparisons}_{seed}.pth",
        )
        return reward_mean
    finally:
        # Always close the wandb run and the environments, even if the run failed,
        # so the next seed does not start with a dangling run.
        run.finish()
        venv.close()


def run_experiment(num_comparisons, num_runs=5):
    """Run the preference-comparison experiment for several seeds and aggregate.

    For each seed ``k`` in ``range(num_runs)`` a reward network is trained with
    ``num_comparisons`` synthetic preference comparisons, a PPO learner is then
    trained on the learned reward, and the learner is evaluated for 100
    episodes. Each run is logged to wandb and its reward network is saved to
    ``models/reward_net_{num_comparisons}_{k}.pth``.

    Args:
        num_comparisons: Total number of comparisons passed to
            ``PreferenceComparisons.train``; reward training is skipped when it
            is not positive.
        num_runs: Number of seeded repetitions (seeds ``0 .. num_runs - 1``).

    Returns:
        Tuple ``(mean_reward, std_error)`` over the per-seed mean evaluation
        returns.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (the aggregate would
            otherwise be NaN after a division by zero).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    results = [
        _run_single_seed(num_comparisons, k) for k in tqdm(range(num_runs))
    ]

    mean_reward = np.mean(results)
    # NOTE(review): np.std defaults to ddof=0 (population std); a sample-based
    # standard error would use ddof=1. Left unchanged so reported numbers stay
    # comparable with earlier runs.
    std_error = np.std(results) / np.sqrt(num_runs)
    return mean_reward, std_error

# Sweep over preference-comparison budgets. Every entry triggers a full
# `run_experiment` call with its default number of seeded runs, so this loop is
# the long-running part of the cell.
comparison_counts = [10, 25, 50, 100, 200, 300, 500]
# Maps comparison budget -> (mean, std_err) as returned by `run_experiment`.
results = {}

for count in comparison_counts:
    mean, std_err = run_experiment(count)
    results[count] = (mean, std_err)


  0%|                                                     | 0/5 [00:00<?, ?it/s][34m[1mwandb[0m: Currently logged in as: [33mmakarov18042003[0m ([33mivan-makarov[0m). Use [1m`wandb login --relogin`[0m to force relogin


Query schedule: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 2 fragments (400 transitions)
Requested 240 transitions but only 0 in buffer. Sampling 240 additional transitions.
Sampling 160 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.5     |
|    agent/rollout/ep_rew_wrapped_mean | 90.3     |
|    agent/time/fps                    | 4404     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.5     |
|    agent/rollout/ep_rew_wrapped_mean | 90.3     |
|    agent/time/fps                    | 4.4e+03  |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.1         |
|    agent/rollout/ep_rew_wrapped_mean | 60.3         |
|    agent/time/fps                    | 4327         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0019720267 |
|    agent/train/clip_fraction         | 0.056        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | -0.494       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0175       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36.6        |
|    agent/rollout/ep_rew_wrapped_mean | 25.9        |
|    agent/time/fps                    | 4438        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.001546226 |
|    agent/train/clip_fraction         | 0.0257      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | 0.756       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0609      |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.8         |
|    agent/rollout/ep_rew_wrapped_mean | 10.5         |
|    agent/time/fps                    | 4480         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0018150501 |
|    agent/train/clip_fraction         | 0.0326       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.907        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.024        |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.4         |
|    agent/rollout/ep_rew_wrapped_mean | -0.855       |
|    agent/time/fps                    | 4368         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0020874296 |
|    agent/train/clip_fraction         | 0.0626       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.957        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0356       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.1         |
|    agent/rollout/ep_rew_wrapped_mean | 8.74         |
|    agent/time/fps                    | 4433         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0027789879 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0026       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 30.3       |
|    agent/rollout/ep_rew_wrapped_mean | 0.77       |
|    agent/time/fps                    | 4493       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 14336      |
|    agent/train/approx_kl             | 0.00226836 |
|    agent/train/clip_fraction         | 0.132      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.674     |
|    agent/train/explained_variance    | -0.529     |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.232      |
|    agent/train/n_updates             | 60         |
|    agent/train/policy_gradient_loss  | -0.00434 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.5         |
|    agent/rollout/ep_rew_wrapped_mean | -4.5         |
|    agent/time/fps                    | 4294         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0018482815 |
|    agent/train/clip_fraction         | 0.0901       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | -0.337       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00855      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.9         |
|    agent/rollout/ep_rew_wrapped_mean | -8.71        |
|    agent/time/fps                    | 4374         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0021927236 |
|    agent/train/clip_fraction         | 0.0873       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | -0.824       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00648     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.5        |
|    agent/rollout/ep_rew_wrapped_mean | -13.1       |
|    agent/time/fps                    | 4490        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.003123926 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.677      |
|    agent/train/explained_variance    | 0.555       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00696    |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

  from pandas.core import (


VBox(children=(Label(value='0.116 MB of 0.116 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▂▃▄▆▇█████▇█████▇▇▇███████▇▇▇▇████
time/fps,█▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▂▁▃▃▄▃▃▄▃▅█▃▂▄▂▄▃▂▄▁▂▅▃▂▃▄▅▃▂▂▃▇▃▄▆▄▄▅▅▅
train/clip_fraction,▃▁▅▇█▅▇█▅▇▅▆▆▅▆▆▆▄▅▄▄▅▆▄▅▅▅▃▄▃▄▅▄▅▅▃▄▃▄▄
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▁▃▄▄▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▅▆▆▆▆▇▇▇▆▇▆▇▇▇▇█▇▇
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,471.28
time/fps,2928.0
train/approx_kl,0.00682
train/clip_fraction,0.12261
train/clip_range,0.1
train/entropy_loss,-0.21526
train/explained_variance,0.99907
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [03:32<14:09, 212.32s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011131113889213238, max=1.0…

Query schedule: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 2 fragments (400 transitions)
Requested 240 transitions but only 0 in buffer. Sampling 240 additional transitions.
Sampling 160 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.5     |
|    agent/rollout/ep_rew_wrapped_mean | 349      |
|    agent/time/fps                    | 4423     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.5     |
|    agent/rollout/ep_rew_wrapped_mean | 349      |
|    agent/time/fps                    | 4.42e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30           |
|    agent/rollout/ep_rew_wrapped_mean | 210          |
|    agent/time/fps                    | 4481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0017742381 |
|    agent/train/clip_fraction         | 0.0608       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | -0.0828      |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.034        |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.9         |
|    agent/rollout/ep_rew_wrapped_mean | 270          |
|    agent/time/fps                    | 3324         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0031403885 |
|    agent/train/clip_fraction         | 0.0935       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.413        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.327        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | 240          |
|    agent/time/fps                    | 4458         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0028702123 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.688       |
|    agent/train/explained_variance    | 0.888        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0603       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.9         |
|    agent/rollout/ep_rew_wrapped_mean | 208          |
|    agent/time/fps                    | 4416         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0025568246 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.682       |
|    agent/train/explained_variance    | 0.821        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0302       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.3         |
|    agent/rollout/ep_rew_wrapped_mean | 180          |
|    agent/time/fps                    | 3752         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0033028596 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.588        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.167        |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.9         |
|    agent/rollout/ep_rew_wrapped_mean | 156          |
|    agent/time/fps                    | 4258         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0025379867 |
|    agent/train/clip_fraction         | 0.125        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.662       |
|    agent/train/explained_variance    | -0.599       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0226       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.6         |
|    agent/rollout/ep_rew_wrapped_mean | 143          |
|    agent/time/fps                    | 4444         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0018498328 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.614        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00226     |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.6        |
|    agent/rollout/ep_rew_wrapped_mean | 133         |
|    agent/time/fps                    | 4433        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.003426448 |
|    agent/train/clip_fraction         | 0.188       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.646      |
|    agent/train/explained_variance    | 0.547       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00279    |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.4         |
|    agent/rollout/ep_rew_wrapped_mean | 124          |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0023812703 |
|    agent/train/clip_fraction         | 0.103        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.642       |
|    agent/train/explained_variance    | 0.722        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00951     |
|    agent/train/n_updates             | 90           |
|    agent/train



VBox(children=(Label(value='0.114 MB of 0.114 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▃▂▃▁▃▁▁▄▃▁▅▄▂▃█▂▆▃▂▃▂▅█▄▃▁▄▄▄▅▂▃▄▃▂▄▄▆▇
train/clip_fraction,▁█▄▆▃▆▃▃▅▃▂▂▄▂▂▄▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▂▃▅▅▆▇▆▇▇█▇███████████████████████████
train/explained_variance,▁▆▆█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,15.64
time/fps,2919.0
train/approx_kl,0.00889
train/clip_fraction,0.07593
train/clip_range,0.1
train/entropy_loss,-0.0834
train/explained_variance,0.99997
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [07:07<10:41, 213.85s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011137903701617486, max=1.0…

Query schedule: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 2 fragments (400 transitions)
Requested 240 transitions but only 0 in buffer. Sampling 240 additional transitions.
Sampling 160 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 145      |
|    agent/time/fps                    | 4326     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 145      |
|    agent/time/fps                    | 4.33e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.4        |
|    agent/rollout/ep_rew_wrapped_mean | 114         |
|    agent/time/fps                    | 4247        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.003535043 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | 0.521       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0628      |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.5         |
|    agent/rollout/ep_rew_wrapped_mean | 130          |
|    agent/time/fps                    | 3197         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0030219667 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.683       |
|    agent/train/explained_variance    | 0.551        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.48         |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.7         |
|    agent/rollout/ep_rew_wrapped_mean | 135          |
|    agent/time/fps                    | 4307         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0047579864 |
|    agent/train/clip_fraction         | 0.256        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.665       |
|    agent/train/explained_variance    | 0.904        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.215        |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.9         |
|    agent/rollout/ep_rew_wrapped_mean | 151          |
|    agent/time/fps                    | 4344         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0033405875 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.644       |
|    agent/train/explained_variance    | 0.936        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.38         |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.2        |
|    agent/rollout/ep_rew_wrapped_mean | 143         |
|    agent/time/fps                    | 3899        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.005996757 |
|    agent/train/clip_fraction         | 0.301       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.605      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.361       |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | 114          |
|    agent/time/fps                    | 4407         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0041145976 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.587       |
|    agent/train/explained_variance    | -0.499       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 1.17         |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.6         |
|    agent/rollout/ep_rew_wrapped_mean | 95.5         |
|    agent/time/fps                    | 4414         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0039062733 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.555       |
|    agent/train/explained_variance    | 0.544        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.301        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 19.5       |
|    agent/rollout/ep_rew_wrapped_mean | 88.7       |
|    agent/time/fps                    | 4399       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 18432      |
|    agent/train/approx_kl             | 0.01629883 |
|    agent/train/clip_fraction         | 0.299      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.474     |
|    agent/train/explained_variance    | 0.926      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.186      |
|    agent/train/n_updates             | 80         |
|    agent/train/policy_gradient_loss  | -0.0158  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.5        |
|    agent/rollout/ep_rew_wrapped_mean | 84          |
|    agent/time/fps                    | 4397        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.021544583 |
|    agent/train/clip_fraction         | 0.221       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.386      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.31        |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.159 MB of 0.159 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▃▃▃▃▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁
time/fps,█▅▅▅▄▄▄▄▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃
train/approx_kl,▃▅▂▇▄▆▂▁▁▂▁▂▃▂▁▁▁▂▁▁▃▁▁▁▁▁▁▃▂▂▁█▂▃▂▄▃▅▂▁
train/clip_fraction,█▃▂▃▂▁▂▁▁▁▁▃▂▁▁▁▁▁▁▁▅▁▁▁▁▁▁▂▁▁▁▃▃▁▃▃▄▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▄▆▇▇██████▆▇███████▄██████████▇▄█▆▇▄▇▇█
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,8.95
time/fps,2727.0
train/approx_kl,0.00136
train/clip_fraction,0.00474
train/clip_range,0.1
train/entropy_loss,-0.00914
train/explained_variance,0.99965
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [10:51<07:16, 218.40s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168551388093167, max=1.0…

Query schedule: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 2 fragments (400 transitions)
Requested 240 transitions but only 0 in buffer. Sampling 240 additional transitions.
Sampling 160 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.2     |
|    agent/rollout/ep_rew_wrapped_mean | 296      |
|    agent/time/fps                    | 4174     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
----------------------------------------------------
| mean/                                |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 26.2      |
|    agent/rollout/ep_rew_wrapped_mean | 296       |
|    agent/time/fps                    | 4.17e+03  |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_ti

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33           |
|    agent/rollout/ep_rew_wrapped_mean | 169          |
|    agent/time/fps                    | 4244         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0015409454 |
|    agent/train/clip_fraction         | 0.0369       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.0196       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0698       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | 140          |
|    agent/time/fps                    | 4404         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0026496446 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.766        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.189        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.8        |
|    agent/rollout/ep_rew_wrapped_mean | 183         |
|    agent/time/fps                    | 4437        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.001616796 |
|    agent/train/clip_fraction         | 0.0664      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.685      |
|    agent/train/explained_variance    | 0.767       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.327       |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.2         |
|    agent/rollout/ep_rew_wrapped_mean | 203          |
|    agent/time/fps                    | 3527         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0023115007 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.132        |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.9        |
|    agent/rollout/ep_rew_wrapped_mean | 196         |
|    agent/time/fps                    | 4183        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.003389567 |
|    agent/train/clip_fraction         | 0.227       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.672      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0195      |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.4         |
|    agent/rollout/ep_rew_wrapped_mean | 199          |
|    agent/time/fps                    | 4427         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0030228456 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0548       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 195          |
|    agent/time/fps                    | 4051         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0028149025 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.018        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.9         |
|    agent/rollout/ep_rew_wrapped_mean | 187          |
|    agent/time/fps                    | 4437         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0016622876 |
|    agent/train/clip_fraction         | 0.0708       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.647       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0777       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 19.1       |
|    agent/rollout/ep_rew_wrapped_mean | 173        |
|    agent/time/fps                    | 4486       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 20480      |
|    agent/train/approx_kl             | 0.00250743 |
|    agent/train/clip_fraction         | 0.0964     |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.644     |
|    agent/train/explained_variance    | 0.86       |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.161      |
|    agent/train/n_updates             | 90         |
|    agent/train/policy_gradient_loss  | -0.00207 



VBox(children=(Label(value='0.147 MB of 0.147 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▄▃▃▃▃▃▃▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▂▂▁▂▁▄▂▄▂▃▄▂▁▃▂▄▃▂▄▁▃▃▅▂▃▄▃▃▂▂▃▂▃▃▂▇▄█▂▅
train/clip_fraction,█▄▂▂▁▂▁▂▂▂▁▁▂▂▁▂▂▁▂▁▁▂▁▁▂▃▂▂▂▁▂▁▁▂▁▂▂▃▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▃▅▅▆▆▆▆▇▆▇▇██▇▇▆█▇▇█▇█▇▇▇█▇▇█▇██▇▇█▇▇▇█
train/explained_variance,▁▇██████████████▆███████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,8.28
time/fps,2553.0
train/approx_kl,0.00821
train/clip_fraction,0.10986
train/clip_range,0.1
train/entropy_loss,-0.19934
train/explained_variance,0.99841
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [14:46<03:45, 225.21s/it]

Query schedule: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 2 fragments (400 transitions)
Requested 240 transitions but only 0 in buffer. Sampling 240 additional transitions.
Sampling 160 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 25.4     |
|    agent/time/fps                    | 1514     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 25.4     |
|    agent/time/fps                    | 1.51e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.1        |
|    agent/rollout/ep_rew_wrapped_mean | 44          |
|    agent/time/fps                    | 3753        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.004333104 |
|    agent/train/clip_fraction         | 0.264       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.689      |
|    agent/train/explained_variance    | 0.319       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0292      |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.4         |
|    agent/rollout/ep_rew_wrapped_mean | 24.4         |
|    agent/time/fps                    | 3660         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0048993453 |
|    agent/train/clip_fraction         | 0.402        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.789        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0244       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | 5.22         |
|    agent/time/fps                    | 4377         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0030950154 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.665       |
|    agent/train/explained_variance    | 0.805        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0033      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.1        |
|    agent/rollout/ep_rew_wrapped_mean | -8.71       |
|    agent/time/fps                    | 4315        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.002730233 |
|    agent/train/clip_fraction         | 0.118       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.664      |
|    agent/train/explained_variance    | 0.784       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0484      |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.5         |
|    agent/rollout/ep_rew_wrapped_mean | -19.2        |
|    agent/time/fps                    | 4425         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0025930651 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.843        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0119      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 22.3       |
|    agent/rollout/ep_rew_wrapped_mean | -24.3      |
|    agent/time/fps                    | 4422       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 14336      |
|    agent/train/approx_kl             | 0.00466277 |
|    agent/train/clip_fraction         | 0.275      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.684     |
|    agent/train/explained_variance    | 0.594      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0172    |
|    agent/train/n_updates             | 60         |
|    agent/train/policy_gradient_loss  | -0.00785 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.9         |
|    agent/rollout/ep_rew_wrapped_mean | -26.9        |
|    agent/time/fps                    | 4466         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0045715375 |
|    agent/train/clip_fraction         | 0.361        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.856        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0122      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.8        |
|    agent/rollout/ep_rew_wrapped_mean | -26.3       |
|    agent/time/fps                    | 4464        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.004088922 |
|    agent/train/clip_fraction         | 0.291       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.682      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00254    |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.1         |
|    agent/rollout/ep_rew_wrapped_mean | -25.7        |
|    agent/time/fps                    | 4353         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0032660477 |
|    agent/train/clip_fraction         | 0.169        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.675       |
|    agent/train/explained_variance    | 0.925        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0111      |
|    agent/train/n_updates             | 90           |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▂▁▁▂▁▁▂▂▂▂▁▂▁▁▃▂▂▂▂▄▇▁▂▄▂▃▂▁▂▂▄▁▂▆▂█▃█▂
train/clip_fraction,▅█▄▃▄▃▃▂▂▂▂▁▁▂▁▂▂▁▂▁▃▂▁▁▁▁▁▁▁▁▁▂▁▁▂▁▁▂▃▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▂▃▃▄▅▆▇▇▇▇▇██▇███▇█████████████████▇█
train/explained_variance,▁▇██████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,14.75
time/fps,2885.0
train/approx_kl,0.00677
train/clip_fraction,0.07109
train/clip_range,0.1
train/entropy_loss,-0.09589
train/explained_variance,0.99998
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [18:19<00:00, 219.98s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011169022223394777, max=1.0…

Query schedule: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 4 fragments (800 transitions)
Requested 480 transitions but only 0 in buffer. Sampling 480 additional transitions.
Sampling 320 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 2 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.5     |
|    agent/rollout/ep_rew_wrapped_mean | 90.3     |
|    agent/time/fps                    | 4497     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.5     |
|    agent/rollout/ep_rew_wrapped_mean | 90.3     |
|    agent/time/fps                    | 4.5e+03  |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.2         |
|    agent/rollout/ep_rew_wrapped_mean | 90.6         |
|    agent/time/fps                    | 4472         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0010685915 |
|    agent/train/clip_fraction         | 0.0598       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.0714       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.622        |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | 30.3         |
|    agent/time/fps                    | 4422         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0020866618 |
|    agent/train/clip_fraction         | 0.0576       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.844        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.248        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.6         |
|    agent/rollout/ep_rew_wrapped_mean | -31.6        |
|    agent/time/fps                    | 4316         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0013986467 |
|    agent/train/clip_fraction         | 0.0279       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.789        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.489        |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.6         |
|    agent/rollout/ep_rew_wrapped_mean | -61.4        |
|    agent/time/fps                    | 1680         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0030073235 |
|    agent/train/clip_fraction         | 0.152        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.129        |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.6         |
|    agent/rollout/ep_rew_wrapped_mean | -77.5        |
|    agent/time/fps                    | 4193         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0026515466 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0431       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.5         |
|    agent/rollout/ep_rew_wrapped_mean | -64.8        |
|    agent/time/fps                    | 2894         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0035356148 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.187        |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.2         |
|    agent/rollout/ep_rew_wrapped_mean | -118         |
|    agent/time/fps                    | 4427         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0035551724 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.471        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
--------------------------------------------------------
| raw/                                 |               |
|    agent/rollout/ep_len_mean         | 500           |
|    agent/rollout/ep_rew_mean         | 25.3          |
|    agent/rollout/ep_rew_wrapped_mean | -164          |
|    agent/time/fps                    | 4422          |
|    agent/time/iterations             | 1             |
|    agent/time/time_elapsed           | 0             |
|    agent/time/total_timesteps        | 18432         |
|    agent/train/approx_kl             | 0.00079158624 |
|    agent/train/clip_fraction         | 0.0312        |
|    agent/train/clip_range            | 0.1           |
|    agent/train/entropy_loss          | -0.631        |
|    agent/train/explained_variance    | 0.97          |
|    agent/train/learning_rate         | 0.002         |
|    agent/train/loss                  | 0.635         |
|    agent/train/n_updates             | 80            |

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.9         |
|    agent/rollout/ep_rew_wrapped_mean | -176         |
|    agent/time/fps                    | 4451         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0021738915 |
|    agent/train/clip_fraction         | 0.0953       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0389       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.1         |
|    agent/rollout/ep_rew_wrapped_mean | -174         |
|    agent/time/fps                    | 2458         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0012862782 |
|    agent/train/clip_fraction         | 0.0749       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.63        |
|    agent/train/explained_variance    | 0.969        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.138        |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.8        |
|    agent/rollout/ep_rew_wrapped_mean | -169        |
|    agent/time/fps                    | 4451        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.002112459 |
|    agent/train/clip_fraction         | 0.0644      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.631      |
|    agent/train/explained_variance    | 0.977       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.099       |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.1        |
|    agent/rollout/ep_rew_wrapped_mean | -163        |
|    agent/time/fps                    | 4392        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.001580723 |
|    agent/train/clip_fraction         | 0.0962      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.621      |
|    agent/train/explained_variance    | 0.983       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.131       |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.4         |
|    agent/rollout/ep_rew_wrapped_mean | -160         |
|    agent/time/fps                    | 4470         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0018553648 |
|    agent/train/clip_fraction         | 0.0837       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.628       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.15         |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | -155         |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0023140465 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.629       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0536       |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.5         |
|    agent/rollout/ep_rew_wrapped_mean | -149         |
|    agent/time/fps                    | 4381         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0022929562 |
|    agent/train/clip_fraction         | 0.098        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.609       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0616       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20           |
|    agent/rollout/ep_rew_wrapped_mean | -143         |
|    agent/time/fps                    | 4494         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0018974408 |
|    agent/train/clip_fraction         | 0.0809       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.592       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.104        |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.9        |
|    agent/rollout/ep_rew_wrapped_mean | -137        |
|    agent/time/fps                    | 4492        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.001322885 |
|    agent/train/clip_fraction         | 0.0792      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.595      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0698      |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.4        |
|    agent/rollout/ep_rew_wrapped_mean | -132        |
|    agent/time/fps                    | 4237        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.002301998 |
|    agent/train/clip_fraction         | 0.0935      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.588      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.031       |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.3         |
|    agent/rollout/ep_rew_wrapped_mean | -125         |
|    agent/time/fps                    | 4503         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0023822908 |
|    agent/train/clip_fraction         | 0.0926       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.58        |
|    agent/train/explained_variance    | 0.931        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0502       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19           |
|    agent/rollout/ep_rew_wrapped_mean | -123         |
|    agent/time/fps                    | 4443         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0017728789 |
|    agent/train/clip_fraction         | 0.0915       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.572       |
|    agent/train/explained_variance    | 0.892        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.073        |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.6        |
|    agent/rollout/ep_rew_wrapped_mean | -128        |
|    agent/time/fps                    | 4486        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.001964681 |
|    agent/train/clip_fraction         | 0.101       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.57       |
|    agent/train/explained_variance    | 0.911       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0241      |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.4         |
|    agent/rollout/ep_rew_wrapped_mean | -126         |
|    agent/time/fps                    | 4483         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0021159695 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.569       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0171       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.1        |
|    agent/rollout/ep_rew_wrapped_mean | -118        |
|    agent/time/fps                    | 4474        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.003721651 |
|    agent/train/clip_fraction         | 0.0735      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.563      |
|    agent/train/explained_variance    | 0.97        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0102      |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▃▃▃▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▄▂▂▃▄▂▂▁▂▂▁▂▂▃▄▂▁▅▃▄▃▄▂▂█▂▂▂▂█▂▄▄▃▇▄▁▂▃
train/clip_fraction,▆█▄▃▂▃▂▂▁▂▄▂▃▂▂▃▂▁▃▃▄▁▂▁▂▃▁▁▁▂▃▂▂▁▂▁▂▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▅▇▆▆▇▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇█▇▇▇██████████▇██
train/explained_variance,▁▅▇██▇▇███▆▇██████▇█████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,8.46
time/fps,2854.0
train/approx_kl,0.00494
train/clip_fraction,0.07715
train/clip_range,0.1
train/entropy_loss,-0.14648
train/explained_variance,0.9925
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [03:52<15:31, 232.81s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01117364628913088, max=1.0)…

Query schedule: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 4 fragments (800 transitions)
Requested 480 transitions but only 0 in buffer. Sampling 480 additional transitions.
Sampling 320 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 2 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.5     |
|    agent/rollout/ep_rew_wrapped_mean | 349      |
|    agent/time/fps                    | 4240     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.5     |
|    agent/rollout/ep_rew_wrapped_mean | 349      |
|    agent/time/fps                    | 4.24e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | 249          |
|    agent/time/fps                    | 2016         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0019249091 |
|    agent/train/clip_fraction         | 0.0578       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.151        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0915       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | 275          |
|    agent/time/fps                    | 3521         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0029780609 |
|    agent/train/clip_fraction         | 0.0982       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.252        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.292        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.4        |
|    agent/rollout/ep_rew_wrapped_mean | 269         |
|    agent/time/fps                    | 4263        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.004603582 |
|    agent/train/clip_fraction         | 0.325       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.685      |
|    agent/train/explained_variance    | 0.932       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.134       |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.2        |
|    agent/rollout/ep_rew_wrapped_mean | 249         |
|    agent/time/fps                    | 3742        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.005073825 |
|    agent/train/clip_fraction         | 0.316       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.676      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0445      |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | 231          |
|    agent/time/fps                    | 3519         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0036939702 |
|    agent/train/clip_fraction         | 0.204        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.661       |
|    agent/train/explained_variance    | 0.863        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0418       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.8         |
|    agent/rollout/ep_rew_wrapped_mean | 219          |
|    agent/time/fps                    | 3596         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0050142957 |
|    agent/train/clip_fraction         | 0.211        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.634       |
|    agent/train/explained_variance    | 0.927        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0518       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.4        |
|    agent/rollout/ep_rew_wrapped_mean | 209         |
|    agent/time/fps                    | 4371        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.007328651 |
|    agent/train/clip_fraction         | 0.273       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.596      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0115     |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.2        |
|    agent/rollout/ep_rew_wrapped_mean | 193         |
|    agent/time/fps                    | 4442        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.005636817 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.572      |
|    agent/train/explained_variance    | 0.81        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0555      |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.1         |
|    agent/rollout/ep_rew_wrapped_mean | 178          |
|    agent/time/fps                    | 3917         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0072211437 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.912        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0157       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.1         |
|    agent/rollout/ep_rew_wrapped_mean | 165          |
|    agent/time/fps                    | 4477         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0019398024 |
|    agent/train/clip_fraction         | 0.0898       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0208       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.2         |
|    agent/rollout/ep_rew_wrapped_mean | 153          |
|    agent/time/fps                    | 4320         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0022504218 |
|    agent/train/clip_fraction         | 0.0827       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.937        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0107       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 16.6       |
|    agent/rollout/ep_rew_wrapped_mean | 144        |
|    agent/time/fps                    | 4365       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 26624      |
|    agent/train/approx_kl             | 0.00832333 |
|    agent/train/clip_fraction         | 0.132      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.541     |
|    agent/train/explained_variance    | 0.942      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.000892   |
|    agent/train/n_updates             | 120        |
|    agent/train/policy_gradient_loss  | -0.00398 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16           |
|    agent/rollout/ep_rew_wrapped_mean | 136          |
|    agent/time/fps                    | 4437         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0036468832 |
|    agent/train/clip_fraction         | 0.0854       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.507       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00146     |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.5        |
|    agent/rollout/ep_rew_wrapped_mean | 129         |
|    agent/time/fps                    | 4354        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.005886879 |
|    agent/train/clip_fraction         | 0.0935      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.506      |
|    agent/train/explained_variance    | 0.947       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0161      |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.1         |
|    agent/rollout/ep_rew_wrapped_mean | 123          |
|    agent/time/fps                    | 4495         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0024410132 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.484       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00764     |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.7         |
|    agent/rollout/ep_rew_wrapped_mean | 117          |
|    agent/time/fps                    | 4417         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0049828473 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.453       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0141      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.3        |
|    agent/rollout/ep_rew_wrapped_mean | 112         |
|    agent/time/fps                    | 4512        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.003831924 |
|    agent/train/clip_fraction         | 0.134       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.433      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0112      |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14           |
|    agent/rollout/ep_rew_wrapped_mean | 108          |
|    agent/time/fps                    | 4411         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0030180502 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.414       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0182      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.7        |
|    agent/rollout/ep_rew_wrapped_mean | 104         |
|    agent/time/fps                    | 4413        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.002077365 |
|    agent/train/clip_fraction         | 0.117       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.396      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00581    |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 13.5       |
|    agent/rollout/ep_rew_wrapped_mean | 94.8       |
|    agent/time/fps                    | 4397       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 43008      |
|    agent/train/approx_kl             | 0.00322687 |
|    agent/train/clip_fraction         | 0.125      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.403     |
|    agent/train/explained_variance    | 0.99       |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.000811  |
|    agent/train/n_updates             | 200        |
|    agent/train/policy_gradient_loss  | -0.00368 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.2         |
|    agent/rollout/ep_rew_wrapped_mean | 85.7         |
|    agent/time/fps                    | 3214         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0038105506 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.38        |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00493     |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13          |
|    agent/rollout/ep_rew_wrapped_mean | 71.3        |
|    agent/time/fps                    | 4394        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.004616929 |
|    agent/train/clip_fraction         | 0.145       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.37       |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000182   |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.8        |
|    agent/rollout/ep_rew_wrapped_mean | 59.8        |
|    agent/time/fps                    | 4157        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.002205639 |
|    agent/train/clip_fraction         | 0.0825      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.366      |
|    agent/train/explained_variance    | 0.983       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00979     |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▆▆▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▂▂▂▂▁▁▁▁▁▁▂▂▂▂▂▂
time/fps,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▃▄▅▃▂▄▁▂▃▂▂▂▁▁▂▂▃▃▂▂▃▄██▄▂▃▄▂▂▅▃▄▃▄▂▃▃▃▃
train/clip_fraction,▇█▅▃▂▂▂▂▂▂▂▁▁▁▂▂▃▄▂▂▃▃▇▄▃▂▂▂▁▂▂▂▃▂▄▂▂▂▂▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▆▇▇██▇███████▇▇▇▇█▇█▇▆▇███████▇█▇▇▇███
train/explained_variance,▁▇▇█████████████▇▆███▇█▇███▇██████▇█████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,10.73
time/fps,2909.0
train/approx_kl,0.00282
train/clip_fraction,0.03516
train/clip_range,0.1
train/entropy_loss,-0.06929
train/explained_variance,0.97981
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [07:40<11:29, 229.93s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011172115743263728, max=1.0…

Query schedule: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 4 fragments (800 transitions)
Requested 480 transitions but only 0 in buffer. Sampling 480 additional transitions.
Sampling 320 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 2 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 145      |
|    agent/time/fps                    | 4440     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 145      |
|    agent/time/fps                    | 4.44e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.2         |
|    agent/rollout/ep_rew_wrapped_mean | 122          |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0036767886 |
|    agent/train/clip_fraction         | 0.0985       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.471        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0719       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.6        |
|    agent/rollout/ep_rew_wrapped_mean | 118         |
|    agent/time/fps                    | 4439        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.003802185 |
|    agent/train/clip_fraction         | 0.15        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.681      |
|    agent/train/explained_variance    | 0.638       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.332       |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.4         |
|    agent/rollout/ep_rew_wrapped_mean | 123          |
|    agent/time/fps                    | 4378         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0031676453 |
|    agent/train/clip_fraction         | 0.133        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.898        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.164        |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26           |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4386         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0046552434 |
|    agent/train/clip_fraction         | 0.241        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.945        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0722       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.3         |
|    agent/rollout/ep_rew_wrapped_mean | 77.5         |
|    agent/time/fps                    | 4447         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0011474661 |
|    agent/train/clip_fraction         | 0.0131       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.64        |
|    agent/train/explained_variance    | 0.569        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.372        |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.7        |
|    agent/rollout/ep_rew_wrapped_mean | 62          |
|    agent/time/fps                    | 4433        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.001981171 |
|    agent/train/clip_fraction         | 0.0653      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.629      |
|    agent/train/explained_variance    | 0.913       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0147      |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.5         |
|    agent/rollout/ep_rew_wrapped_mean | 47.8         |
|    agent/time/fps                    | 4469         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0012597085 |
|    agent/train/clip_fraction         | 0.0268       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.629       |
|    agent/train/explained_variance    | 0.772        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0108       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.7         |
|    agent/rollout/ep_rew_wrapped_mean | 37.1         |
|    agent/time/fps                    | 4357         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0036845617 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.607       |
|    agent/train/explained_variance    | 0.777        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0107       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.6        |
|    agent/rollout/ep_rew_wrapped_mean | 29.9        |
|    agent/time/fps                    | 4396        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.005151813 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.553      |
|    agent/train/explained_variance    | 0.841       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00433     |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.8         |
|    agent/rollout/ep_rew_wrapped_mean | 23.3         |
|    agent/time/fps                    | 4454         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0025856448 |
|    agent/train/clip_fraction         | 0.0947       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.532       |
|    agent/train/explained_variance    | 0.837        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0203       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.7         |
|    agent/rollout/ep_rew_wrapped_mean | 17.5         |
|    agent/time/fps                    | 4426         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0075975372 |
|    agent/train/clip_fraction         | 0.129        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.497       |
|    agent/train/explained_variance    | 0.905        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.014       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 12.3         |
|    agent/time/fps                    | 4408         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0012782556 |
|    agent/train/clip_fraction         | 0.0588       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.484       |
|    agent/train/explained_variance    | 0.928        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00414      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20           |
|    agent/rollout/ep_rew_wrapped_mean | 7.29         |
|    agent/time/fps                    | 4285         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0017885612 |
|    agent/train/clip_fraction         | 0.0852       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000852     |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.3         |
|    agent/rollout/ep_rew_wrapped_mean | 2.8          |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0029532902 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.5         |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00209      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.7        |
|    agent/rollout/ep_rew_wrapped_mean | -1.15       |
|    agent/time/fps                    | 4471        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.003376712 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.502      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.000692    |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.2        |
|    agent/rollout/ep_rew_wrapped_mean | -4.72       |
|    agent/time/fps                    | 4458        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.002862323 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.536      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0237     |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.8         |
|    agent/rollout/ep_rew_wrapped_mean | -7.81        |
|    agent/time/fps                    | 4291         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0022466362 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.547       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0341      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.4         |
|    agent/rollout/ep_rew_wrapped_mean | -10.5        |
|    agent/time/fps                    | 2911         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0024727494 |
|    agent/train/clip_fraction         | 0.183        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.563       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00253      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.9         |
|    agent/rollout/ep_rew_wrapped_mean | -12.8        |
|    agent/time/fps                    | 3792         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0023896564 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.551       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0209      |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.6         |
|    agent/rollout/ep_rew_wrapped_mean | -18.2        |
|    agent/time/fps                    | 4357         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0016018591 |
|    agent/train/clip_fraction         | 0.0858       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.562       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0142      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 16.2        |
|    agent/rollout/ep_rew_wrapped_mean | -26.6       |
|    agent/time/fps                    | 4216        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.003006332 |
|    agent/train/clip_fraction         | 0.166       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.557      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0116     |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.9         |
|    agent/rollout/ep_rew_wrapped_mean | -34.9        |
|    agent/time/fps                    | 4154         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0034497194 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.556       |
|    agent/train/explained_variance    | 0.964        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00127     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.7         |
|    agent/rollout/ep_rew_wrapped_mean | -44.4        |
|    agent/time/fps                    | 4498         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0025540218 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.559       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0189      |
|    agent/train/n_updates             | 230          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▅▂▁▁▂▃▅▆▆▇████▇▆▆▆▆▅▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂
time/fps,█▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▂▃▁▁▁▁▂▁▅▂▁▂▁▁▁▁▂▃▂▂▂▂▂▃▂▇▂▂▂▂▂▃█▃▄▆▄▃▂
train/clip_fraction,▁█▅▃▁▂▂▂▂▃▂▂▂▁▁▁▁▂▅▃▃▂▂▂▅▂▅▂▂▂▂▁▂▅▂▂▄▃▂▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▄▆▇▇▇▇▇▇█▇▇▇▇▇██▆▇▇▇▇▇▇▇▇██████▇██▇███
train/explained_variance,▁▇▇██████████████████████████████▇██████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,17.76
time/fps,2883.0
train/approx_kl,0.00496
train/clip_fraction,0.06792
train/clip_range,0.1
train/entropy_loss,-0.09931
train/explained_variance,0.98188
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [11:26<07:35, 227.96s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011170556944691473, max=1.0…

Query schedule: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 4 fragments (800 transitions)
Requested 480 transitions but only 0 in buffer. Sampling 480 additional transitions.
Sampling 320 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 2 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.2     |
|    agent/rollout/ep_rew_wrapped_mean | 296      |
|    agent/time/fps                    | 4451     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26.2     |
|    agent/rollout/ep_rew_wrapped_mean | 296      |
|    agent/time/fps                    | 4.45e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.6        |
|    agent/rollout/ep_rew_wrapped_mean | 98.8        |
|    agent/time/fps                    | 2533        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.003818913 |
|    agent/train/clip_fraction         | 0.17        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | -0.285      |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00143     |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | 72.3         |
|    agent/time/fps                    | 3195         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0037757054 |
|    agent/train/clip_fraction         | 0.0857       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.682       |
|    agent/train/explained_variance    | 0.318        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0479       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.5         |
|    agent/rollout/ep_rew_wrapped_mean | 68.1         |
|    agent/time/fps                    | 4407         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0016808145 |
|    agent/train/clip_fraction         | 0.0306       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.799        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0472       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.1         |
|    agent/rollout/ep_rew_wrapped_mean | 71           |
|    agent/time/fps                    | 4449         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0023503467 |
|    agent/train/clip_fraction         | 0.0897       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.885        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0189       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.5         |
|    agent/rollout/ep_rew_wrapped_mean | 62           |
|    agent/time/fps                    | 4348         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0037144246 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00865      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.5         |
|    agent/rollout/ep_rew_wrapped_mean | 56.3         |
|    agent/time/fps                    | 4361         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0028218152 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.678       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0104       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.1        |
|    agent/rollout/ep_rew_wrapped_mean | 51          |
|    agent/time/fps                    | 4437        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.002412293 |
|    agent/train/clip_fraction         | 0.096       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.672      |
|    agent/train/explained_variance    | 0.614       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00803    |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.1        |
|    agent/rollout/ep_rew_wrapped_mean | 46.7        |
|    agent/time/fps                    | 4431        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.002953715 |
|    agent/train/clip_fraction         | 0.155       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.665      |
|    agent/train/explained_variance    | 0.855       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000825   |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.2         |
|    agent/rollout/ep_rew_wrapped_mean | 42.5         |
|    agent/time/fps                    | 4412         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0031115874 |
|    agent/train/clip_fraction         | 0.176        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.655       |
|    agent/train/explained_variance    | 0.813        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00567     |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.1        |
|    agent/rollout/ep_rew_wrapped_mean | 38.3        |
|    agent/time/fps                    | 4440        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.002289935 |
|    agent/train/clip_fraction         | 0.123       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.652      |
|    agent/train/explained_variance    | 0.772       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0154     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.8         |
|    agent/rollout/ep_rew_wrapped_mean | 35           |
|    agent/time/fps                    | 4345         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0020098637 |
|    agent/train/clip_fraction         | 0.094        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.196        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00815     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.9         |
|    agent/rollout/ep_rew_wrapped_mean | 32.5         |
|    agent/time/fps                    | 4465         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0035148342 |
|    agent/train/clip_fraction         | 0.254        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.645       |
|    agent/train/explained_variance    | 0.828        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0209      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.6        |
|    agent/rollout/ep_rew_wrapped_mean | 30.6        |
|    agent/time/fps                    | 4449        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.003512192 |
|    agent/train/clip_fraction         | 0.172       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.64       |
|    agent/train/explained_variance    | 0.373       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00129    |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.8         |
|    agent/rollout/ep_rew_wrapped_mean | 29.1         |
|    agent/time/fps                    | 4240         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0038515828 |
|    agent/train/clip_fraction         | 0.227        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.701        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00027     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22           |
|    agent/rollout/ep_rew_wrapped_mean | 27.7         |
|    agent/time/fps                    | 4331         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0036046149 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.627       |
|    agent/train/explained_variance    | 0.861        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0012      |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.2        |
|    agent/rollout/ep_rew_wrapped_mean | 26.4        |
|    agent/time/fps                    | 4422        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.002125977 |
|    agent/train/clip_fraction         | 0.125       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.629      |
|    agent/train/explained_variance    | 0.881       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0104     |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.7         |
|    agent/rollout/ep_rew_wrapped_mean | 25.2         |
|    agent/time/fps                    | 4474         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0039415043 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.621       |
|    agent/train/explained_variance    | 0.936        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0289      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.1         |
|    agent/rollout/ep_rew_wrapped_mean | 23.8         |
|    agent/time/fps                    | 4498         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0024688733 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.622       |
|    agent/train/explained_variance    | 0.761        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00148     |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.6        |
|    agent/rollout/ep_rew_wrapped_mean | 21.8        |
|    agent/time/fps                    | 4298        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.004143563 |
|    agent/train/clip_fraction         | 0.264       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.614      |
|    agent/train/explained_variance    | 0.915       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00993    |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19           |
|    agent/rollout/ep_rew_wrapped_mean | 14.6         |
|    agent/time/fps                    | 3681         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0044649704 |
|    agent/train/clip_fraction         | 0.249        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.594       |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00942     |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.6         |
|    agent/rollout/ep_rew_wrapped_mean | 13           |
|    agent/time/fps                    | 4172         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0049845967 |
|    agent/train/clip_fraction         | 0.27         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.549       |
|    agent/train/explained_variance    | 0.92         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0255      |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.1        |
|    agent/rollout/ep_rew_wrapped_mean | 10.7        |
|    agent/time/fps                    | 3350        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.007459202 |
|    agent/train/clip_fraction         | 0.211       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.508      |
|    agent/train/explained_variance    | 0.948       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0332     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.8         |
|    agent/rollout/ep_rew_wrapped_mean | 7.2          |
|    agent/time/fps                    | 3493         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0052155955 |
|    agent/train/clip_fraction         | 0.213        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00748     |
|    agent/train/n_updates             | 230          |
|    agent/train



VBox(children=(Label(value='0.161 MB of 0.161 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,▇▁▄▅▆▆▆▇▇▆▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████████▆▇▇
train/approx_kl,▁▂▁▁▂▃▁▂▂▁▁▂▁▂▂▁▁▂▂▂▂▂▂▃▂▃▂▃▂▂▂▂▁▂█▂▂▂▁▂
train/clip_fraction,█▆▅▆▅▅▄▃▃▃▄▄▃▂▂▂▁▃▃▃▂▂▂▁▂▁▂▃▃▁▂▃▁▃▃▁▃▃▂▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▃▅▆▆▇▇▆▆▆▇▇▇▇█▇▇▇███████▇▇██▇█████▇██
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,9.06
time/fps,2762.0
train/approx_kl,0.00576
train/clip_fraction,0.10679
train/clip_range,0.1
train/entropy_loss,-0.1518
train/explained_variance,0.99979
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [15:25<03:52, 232.31s/it]

Query schedule: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 4 fragments (800 transitions)
Requested 480 transitions but only 0 in buffer. Sampling 480 additional transitions.
Sampling 320 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 2 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 25.4     |
|    agent/time/fps                    | 4458     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | 25.4     |
|    agent/time/fps                    | 4.46e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.4         |
|    agent/rollout/ep_rew_wrapped_mean | 199          |
|    agent/time/fps                    | 4227         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0046959072 |
|    agent/train/clip_fraction         | 0.291        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.0595       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.91         |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.6        |
|    agent/rollout/ep_rew_wrapped_mean | 217         |
|    agent/time/fps                    | 4447        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.004699435 |
|    agent/train/clip_fraction         | 0.279       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.676      |
|    agent/train/explained_variance    | 0.682       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 2.37        |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.7        |
|    agent/rollout/ep_rew_wrapped_mean | 284         |
|    agent/time/fps                    | 4471        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.004475315 |
|    agent/train/clip_fraction         | 0.296       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.653      |
|    agent/train/explained_variance    | 0.65        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 2.5         |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.2        |
|    agent/rollout/ep_rew_wrapped_mean | 352         |
|    agent/time/fps                    | 4448        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.004092578 |
|    agent/train/clip_fraction         | 0.237       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.628      |
|    agent/train/explained_variance    | 0.729       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 2.62        |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.4        |
|    agent/rollout/ep_rew_wrapped_mean | 367         |
|    agent/time/fps                    | 4246        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.006318054 |
|    agent/train/clip_fraction         | 0.328       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.583      |
|    agent/train/explained_variance    | 0.906       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.979       |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.9         |
|    agent/rollout/ep_rew_wrapped_mean | 388          |
|    agent/time/fps                    | 4371         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0078968685 |
|    agent/train/clip_fraction         | 0.262        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.517       |
|    agent/train/explained_variance    | 0.908        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.815        |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.4         |
|    agent/rollout/ep_rew_wrapped_mean | 396          |
|    agent/time/fps                    | 4432         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0130565725 |
|    agent/train/clip_fraction         | 0.284        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.46        |
|    agent/train/explained_variance    | 0.877        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 1.53         |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.5        |
|    agent/rollout/ep_rew_wrapped_mean | 422         |
|    agent/time/fps                    | 4323        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.014048288 |
|    agent/train/clip_fraction         | 0.214       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.367      |
|    agent/train/explained_variance    | 0.938       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.76        |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.6        |
|    agent/rollout/ep_rew_wrapped_mean | 437         |
|    agent/time/fps                    | 4381        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.021101013 |
|    agent/train/clip_fraction         | 0.166       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.27       |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.63        |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.8         |
|    agent/rollout/ep_rew_wrapped_mean | 446          |
|    agent/time/fps                    | 4312         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0053820047 |
|    agent/train/clip_fraction         | 0.0797       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.262       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.502        |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 16          |
|    agent/rollout/ep_rew_wrapped_mean | 452         |
|    agent/time/fps                    | 4394        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.012136709 |
|    agent/train/clip_fraction         | 0.0803      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.224      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.394       |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.4         |
|    agent/rollout/ep_rew_wrapped_mean | 454          |
|    agent/time/fps                    | 4463         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0036221573 |
|    agent/train/clip_fraction         | 0.052        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.201       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.229        |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | 457          |
|    agent/time/fps                    | 4475         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0049707843 |
|    agent/train/clip_fraction         | 0.0546       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.164       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.255        |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.5        |
|    agent/rollout/ep_rew_wrapped_mean | 459         |
|    agent/time/fps                    | 4428        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.004497735 |
|    agent/train/clip_fraction         | 0.065       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.134      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.186       |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.1        |
|    agent/rollout/ep_rew_wrapped_mean | 456         |
|    agent/time/fps                    | 4491        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.007726698 |
|    agent/train/clip_fraction         | 0.0396      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.0888     |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.306       |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.7         |
|    agent/rollout/ep_rew_wrapped_mean | 447          |
|    agent/time/fps                    | 4460         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0041379025 |
|    agent/train/clip_fraction         | 0.0157       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.0457      |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0532       |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.4         |
|    agent/rollout/ep_rew_wrapped_mean | 437          |
|    agent/time/fps                    | 4506         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0025133772 |
|    agent/train/clip_fraction         | 0.0315       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.103       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.161        |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.1         |
|    agent/rollout/ep_rew_wrapped_mean | 426          |
|    agent/time/fps                    | 4509         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0004049328 |
|    agent/train/clip_fraction         | 0.00791      |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.0442      |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.06         |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 13         |
|    agent/rollout/ep_rew_wrapped_mean | 411        |
|    agent/time/fps                    | 4331       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 40960      |
|    agent/train/approx_kl             | 0.16373308 |
|    agent/train/clip_fraction         | 0.056      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.0367    |
|    agent/train/explained_variance    | 0.948      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0301    |
|    agent/train/n_updates             | 190        |
|    agent/train/policy_gradient_loss  | -0.00439 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.8        |
|    agent/rollout/ep_rew_wrapped_mean | 357         |
|    agent/time/fps                    | 4494        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.005583229 |
|    agent/train/clip_fraction         | 0.0492      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.318      |
|    agent/train/explained_variance    | 0.678       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 4.03        |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.8         |
|    agent/rollout/ep_rew_wrapped_mean | 303          |
|    agent/time/fps                    | 4401         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0033564074 |
|    agent/train/clip_fraction         | 0.0671       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.362       |
|    agent/train/explained_variance    | 0.907        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 1.47         |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.8         |
|    agent/rollout/ep_rew_wrapped_mean | 254          |
|    agent/time/fps                    | 4369         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0067494903 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.446       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 1.19         |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.9        |
|    agent/rollout/ep_rew_wrapped_mean | 198         |
|    agent/time/fps                    | 4439        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.009834214 |
|    agent/train/clip_fraction         | 0.267       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.566      |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 1.04        |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.111 MB of 0.111 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▃▇▆▂▅▁█▂▃▂▂▃▁▂▂▂▂▁▂▃▃▁▂▂▆▁▁▄▁▂▃▂▂▂▁▂▃▁▁▁
train/clip_fraction,█▃▂▂▁▁▁▁▁▁▂▂▁▁▁▁▁▁▂▂▁▁▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▄▇▇█████▇█▇███████▇███▆███▇████████████
train/explained_variance,▁▇███████▇█████████▇██▆▇███▇███████▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,8.42
time/fps,2859.0
train/approx_kl,0.00056
train/clip_fraction,0.01411
train/clip_range,0.1
train/entropy_loss,-0.01167
train/explained_variance,0.9997
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [19:16<00:00, 231.37s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116876897867769, max=1.0)…

Query schedule: [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 10 fragments (2000 transitions)
Requested 1200 transitions but only 0 in buffer. Sampling 1200 additional transitions.
Sampling 800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 5 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.2     |
|    agent/rollout/ep_rew_wrapped_mean | 98.8     |
|    agent/time/fps                    | 4504     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.2     |
|    agent/rollout/ep_rew_wrapped_mean | 98.8     |
|    agent/time/fps                    | 4.5e+03  |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.4         |
|    agent/rollout/ep_rew_wrapped_mean | 13.8         |
|    agent/time/fps                    | 4394         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0017461178 |
|    agent/train/clip_fraction         | 0.037        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.0992       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0353       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.8         |
|    agent/rollout/ep_rew_wrapped_mean | -0.476       |
|    agent/time/fps                    | 4473         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0026698664 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.828        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0106       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.4         |
|    agent/rollout/ep_rew_wrapped_mean | -2.51        |
|    agent/time/fps                    | 4482         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0032962314 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.9          |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00286      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.2        |
|    agent/rollout/ep_rew_wrapped_mean | -10.9       |
|    agent/time/fps                    | 4463        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.002717305 |
|    agent/train/clip_fraction         | 0.118       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.684      |
|    agent/train/explained_variance    | 0.704       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.058       |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.3         |
|    agent/rollout/ep_rew_wrapped_mean | -30.4        |
|    agent/time/fps                    | 4167         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0022819485 |
|    agent/train/clip_fraction         | 0.103        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.683       |
|    agent/train/explained_variance    | 0.607        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.349        |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.3         |
|    agent/rollout/ep_rew_wrapped_mean | -54.9        |
|    agent/time/fps                    | 4430         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0022672014 |
|    agent/train/clip_fraction         | 0.0767       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.883        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 1.53         |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | -54.7        |
|    agent/time/fps                    | 4368         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0035472517 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.964        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0334       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | -51          |
|    agent/time/fps                    | 4322         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0040259217 |
|    agent/train/clip_fraction         | 0.262        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.897        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00165     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.5         |
|    agent/rollout/ep_rew_wrapped_mean | -46.2        |
|    agent/time/fps                    | 4323         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0032467945 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.837        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00541      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.8         |
|    agent/rollout/ep_rew_wrapped_mean | -40.1        |
|    agent/time/fps                    | 4454         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0039008937 |
|    agent/train/clip_fraction         | 0.256        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.884        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0119      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | -35.5        |
|    agent/time/fps                    | 4391         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0046281805 |
|    agent/train/clip_fraction         | 0.331        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.67        |
|    agent/train/explained_variance    | 0.94         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00606     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.5         |
|    agent/rollout/ep_rew_wrapped_mean | -34.1        |
|    agent/time/fps                    | 4181         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0025163386 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.903        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00243     |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.6         |
|    agent/rollout/ep_rew_wrapped_mean | -32.5        |
|    agent/time/fps                    | 4478         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0025412566 |
|    agent/train/clip_fraction         | 0.0686       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.905        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0133      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | -31.1        |
|    agent/time/fps                    | 4480         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0035259386 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.661       |
|    agent/train/explained_variance    | 0.897        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00527     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | -29.5        |
|    agent/time/fps                    | 4451         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0024393597 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.665       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00999     |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | -27.9        |
|    agent/time/fps                    | 4488         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0034831292 |
|    agent/train/clip_fraction         | 0.234        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0151      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.8         |
|    agent/rollout/ep_rew_wrapped_mean | -26.2        |
|    agent/time/fps                    | 4386         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0023146262 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.663       |
|    agent/train/explained_variance    | 0.89         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00172      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -30.2        |
|    agent/time/fps                    | 4473         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0018910599 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.666       |
|    agent/train/explained_variance    | -3.22        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.159        |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -33.7        |
|    agent/time/fps                    | 4446         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0020532063 |
|    agent/train/clip_fraction         | 0.0838       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | -0.457       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0191       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.5         |
|    agent/rollout/ep_rew_wrapped_mean | -42.8        |
|    agent/time/fps                    | 4512         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0019619048 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.673        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0518       |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | -43.7        |
|    agent/time/fps                    | 4493         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0021840401 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0087      |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.8        |
|    agent/rollout/ep_rew_wrapped_mean | -46.4       |
|    agent/time/fps                    | 4334        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.002952848 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.655      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0225     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.3         |
|    agent/rollout/ep_rew_wrapped_mean | -50.4        |
|    agent/time/fps                    | 4501         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0034658813 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.645       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0125      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -52.5        |
|    agent/time/fps                    | 4418         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0038493718 |
|    agent/train/clip_fraction         | 0.199        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00887     |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.4        |
|    agent/rollout/ep_rew_wrapped_mean | -50.3       |
|    agent/time/fps                    | 3946        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.005869685 |
|    agent/train/clip_fraction         | 0.239       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.617      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00162    |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | -44.7        |
|    agent/time/fps                    | 4031         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0031738142 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.618       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00477      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.9        |
|    agent/rollout/ep_rew_wrapped_mean | -46.6       |
|    agent/time/fps                    | 4506        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.003519323 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.616      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00183     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.5         |
|    agent/rollout/ep_rew_wrapped_mean | -50          |
|    agent/time/fps                    | 4094         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0042010034 |
|    agent/train/clip_fraction         | 0.203        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.607       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00397      |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.3         |
|    agent/rollout/ep_rew_wrapped_mean | -53.3        |
|    agent/time/fps                    | 4503         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0051678224 |
|    agent/train/clip_fraction         | 0.245        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.594       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0275      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.1         |
|    agent/rollout/ep_rew_wrapped_mean | -57.8        |
|    agent/time/fps                    | 4485         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0042681247 |
|    agent/train/clip_fraction         | 0.2          |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.585       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00315     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.9         |
|    agent/rollout/ep_rew_wrapped_mean | -61.7        |
|    agent/time/fps                    | 4481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0040849266 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0151      |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.8        |
|    agent/rollout/ep_rew_wrapped_mean | -63.6       |
|    agent/time/fps                    | 4525        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.004362932 |
|    agent/train/clip_fraction         | 0.179       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.565      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0102     |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.7         |
|    agent/rollout/ep_rew_wrapped_mean | -65.8        |
|    agent/time/fps                    | 4129         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0045049833 |
|    agent/train/clip_fraction         | 0.211        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.558       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0195      |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.4         |
|    agent/rollout/ep_rew_wrapped_mean | -67.9        |
|    agent/time/fps                    | 4472         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0034612068 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.555       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.017       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.5        |
|    agent/rollout/ep_rew_wrapped_mean | -70.1       |
|    agent/time/fps                    | 4479        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.004225585 |
|    agent/train/clip_fraction         | 0.172       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.529      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0114     |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.1         |
|    agent/rollout/ep_rew_wrapped_mean | -72.7        |
|    agent/time/fps                    | 3887         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0050476845 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.498       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0313      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21          |
|    agent/rollout/ep_rew_wrapped_mean | -75.3       |
|    agent/time/fps                    | 4416        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005521535 |
|    agent/train/clip_fraction         | 0.188       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.485      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0252     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.6         |
|    agent/rollout/ep_rew_wrapped_mean | -72.4        |
|    agent/time/fps                    | 4501         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0055752946 |
|    agent/train/clip_fraction         | 0.183        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.435       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0113      |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 20.4       |
|    agent/rollout/ep_rew_wrapped_mean | -69.5      |
|    agent/time/fps                    | 4398       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 81920      |
|    agent/train/approx_kl             | 0.00543709 |
|    agent/train/clip_fraction         | 0.161      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.418     |
|    agent/train/explained_variance    | 0.987      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00621    |
|    agent/train/n_updates             | 390        |
|    agent/train/policy_gradient_loss  | -0.00625 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.9         |
|    agent/rollout/ep_rew_wrapped_mean | -67.4        |
|    agent/time/fps                    | 4507         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0035962332 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.425       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0128      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.3         |
|    agent/rollout/ep_rew_wrapped_mean | -64.8        |
|    agent/time/fps                    | 4442         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0034955668 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.429       |
|    agent/train/explained_variance    | 0.876        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0452       |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.5         |
|    agent/rollout/ep_rew_wrapped_mean | -62.5        |
|    agent/time/fps                    | 4499         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0034590173 |
|    agent/train/clip_fraction         | 0.133        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.364       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00609     |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 22.9       |
|    agent/rollout/ep_rew_wrapped_mean | -60.1      |
|    agent/time/fps                    | 4480       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 90112      |
|    agent/train/approx_kl             | 0.00548972 |
|    agent/train/clip_fraction         | 0.107      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.337     |
|    agent/train/explained_variance    | 0.993      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.01      |
|    agent/train/n_updates             | 430        |
|    agent/train/policy_gradient_loss  | -0.00336 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.1         |
|    agent/rollout/ep_rew_wrapped_mean | -57.7        |
|    agent/time/fps                    | 4496         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0030133808 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.324       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00181     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.4         |
|    agent/rollout/ep_rew_wrapped_mean | -55.4        |
|    agent/time/fps                    | 4456         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0028135106 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.325       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00635     |
|    agent/train/n_updates             | 450          |
|    agent/train



VBox(children=(Label(value='0.113 MB of 0.113 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▃▃▃▄▃▄▄▅█
time/fps,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▃▄▃▂▄▁▁▃▃▃▃▄▂▅▂▂█▂▂▅▇▂▃▄▇▅▅▃▃▃▅▄▄█▄▇▆▄▄
train/clip_fraction,▁█▄▃▂▃▂▂▂▄▃▂▂▂▃▂▂▄▂▂▄▅▃▄▄▃▄▄▃▃▃▃▃▂▃▂▃▃▃▅
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▄▄▆▆▇▇▅▆▇▇▇▆██▆██▄▄▅▅▆▆▆▆▅▇▇▇▆█▇█▆▆▅▄
train/explained_variance,▁██████████████████████▇████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,256.64001
time/fps,2896.0
train/approx_kl,0.00474
train/clip_fraction,0.22515
train/clip_range,0.1
train/entropy_loss,-0.47737
train/explained_variance,0.99524
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [04:08<16:35, 248.91s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011228332878090441, max=1.0…

Query schedule: [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 10 fragments (2000 transitions)
Requested 1200 transitions but only 0 in buffer. Sampling 1200 additional transitions.
Sampling 800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 5 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 26       |
|    agent/rollout/ep_rew_wrapped_mean | 300      |
|    agent/time/fps                    | 4490     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
----------------------------------------------------
| mean/                                |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 26        |
|    agent/rollout/ep_rew_wrapped_mean | 300       |
|    agent/time/fps                    | 4.49e+03  |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_ti

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.1         |
|    agent/rollout/ep_rew_wrapped_mean | 217          |
|    agent/time/fps                    | 4391         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0010350666 |
|    agent/train/clip_fraction         | 0.0313       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.14         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.033        |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | 176          |
|    agent/time/fps                    | 3668         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0022327283 |
|    agent/train/clip_fraction         | 0.0723       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.759        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0336       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.4         |
|    agent/rollout/ep_rew_wrapped_mean | 159          |
|    agent/time/fps                    | 4425         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0022037518 |
|    agent/train/clip_fraction         | 0.0534       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.736        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00781      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.6         |
|    agent/rollout/ep_rew_wrapped_mean | 140          |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0010298877 |
|    agent/train/clip_fraction         | 0.016        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.263        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00563      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.1        |
|    agent/rollout/ep_rew_wrapped_mean | 127         |
|    agent/time/fps                    | 4452        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.002871869 |
|    agent/train/clip_fraction         | 0.176       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.687      |
|    agent/train/explained_variance    | 0.0731      |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00418     |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29          |
|    agent/rollout/ep_rew_wrapped_mean | 117         |
|    agent/time/fps                    | 4458        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.003115974 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.679      |
|    agent/train/explained_variance    | 0.772       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000857   |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.2         |
|    agent/rollout/ep_rew_wrapped_mean | 108          |
|    agent/time/fps                    | 4420         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0032517535 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.631        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00685     |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.8         |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4428         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0036520653 |
|    agent/train/clip_fraction         | 0.239        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.848        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0314      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.3         |
|    agent/rollout/ep_rew_wrapped_mean | 98.5         |
|    agent/time/fps                    | 4480         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0043368605 |
|    agent/train/clip_fraction         | 0.332        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.661       |
|    agent/train/explained_variance    | 0.851        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.034       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.5         |
|    agent/rollout/ep_rew_wrapped_mean | 95.5         |
|    agent/time/fps                    | 4323         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0050520613 |
|    agent/train/clip_fraction         | 0.328        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.879        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0169      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | 93.2         |
|    agent/time/fps                    | 4260         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0054159663 |
|    agent/train/clip_fraction         | 0.368        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.64        |
|    agent/train/explained_variance    | 0.868        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0161      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.7         |
|    agent/rollout/ep_rew_wrapped_mean | 91.3         |
|    agent/time/fps                    | 4329         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0057954593 |
|    agent/train/clip_fraction         | 0.352        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.606       |
|    agent/train/explained_variance    | 0.887        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0161      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.1        |
|    agent/rollout/ep_rew_wrapped_mean | 89.9        |
|    agent/time/fps                    | 4482        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.008327613 |
|    agent/train/clip_fraction         | 0.334       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.56       |
|    agent/train/explained_variance    | 0.846       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0272     |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24          |
|    agent/rollout/ep_rew_wrapped_mean | 88.9        |
|    agent/time/fps                    | 4436        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.010090597 |
|    agent/train/clip_fraction         | 0.287       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.503      |
|    agent/train/explained_variance    | 0.875       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0205     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.5        |
|    agent/rollout/ep_rew_wrapped_mean | 88.1        |
|    agent/time/fps                    | 4442        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.012668621 |
|    agent/train/clip_fraction         | 0.311       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.447      |
|    agent/train/explained_variance    | 0.845       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0163     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.9        |
|    agent/rollout/ep_rew_wrapped_mean | 87.3        |
|    agent/time/fps                    | 4480        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.010653067 |
|    agent/train/clip_fraction         | 0.233       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.397      |
|    agent/train/explained_variance    | 0.91        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0197     |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.4         |
|    agent/rollout/ep_rew_wrapped_mean | 86.6         |
|    agent/time/fps                    | 4264         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0060292897 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.419       |
|    agent/train/explained_variance    | 0.921        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0279      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.8         |
|    agent/rollout/ep_rew_wrapped_mean | 86           |
|    agent/time/fps                    | 4496         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0057361485 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.447       |
|    agent/train/explained_variance    | 0.704        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0169      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.4         |
|    agent/rollout/ep_rew_wrapped_mean | 85.5         |
|    agent/time/fps                    | 4397         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0050224797 |
|    agent/train/clip_fraction         | 0.222        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.659        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0122      |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.1         |
|    agent/rollout/ep_rew_wrapped_mean | 74.5         |
|    agent/time/fps                    | 4438         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0042931056 |
|    agent/train/clip_fraction         | 0.209        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.481       |
|    agent/train/explained_variance    | 0.77         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0186      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.6         |
|    agent/rollout/ep_rew_wrapped_mean | 71.3         |
|    agent/time/fps                    | 4320         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0042812726 |
|    agent/train/clip_fraction         | 0.178        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.508       |
|    agent/train/explained_variance    | 0.581        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00153      |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.2        |
|    agent/rollout/ep_rew_wrapped_mean | 69.9        |
|    agent/time/fps                    | 4389        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.003918398 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.529      |
|    agent/train/explained_variance    | 0.389       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00667     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.8         |
|    agent/rollout/ep_rew_wrapped_mean | 67.6         |
|    agent/time/fps                    | 3852         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0061434414 |
|    agent/train/clip_fraction         | 0.279        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.505       |
|    agent/train/explained_variance    | 0.918        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0283      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.6         |
|    agent/rollout/ep_rew_wrapped_mean | 67.9         |
|    agent/time/fps                    | 4253         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0065953927 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.516       |
|    agent/train/explained_variance    | 0.853        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0419       |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19           |
|    agent/rollout/ep_rew_wrapped_mean | 68.4         |
|    agent/time/fps                    | 4469         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0067332024 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.488       |
|    agent/train/explained_variance    | 0.882        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0113      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18          |
|    agent/rollout/ep_rew_wrapped_mean | 69.3        |
|    agent/time/fps                    | 4412        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.011546129 |
|    agent/train/clip_fraction         | 0.227       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.483      |
|    agent/train/explained_variance    | 0.928       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00518    |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.3         |
|    agent/rollout/ep_rew_wrapped_mean | 70.9         |
|    agent/time/fps                    | 4428         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0047770045 |
|    agent/train/clip_fraction         | 0.178        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.501       |
|    agent/train/explained_variance    | 0.891        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00329      |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.8         |
|    agent/rollout/ep_rew_wrapped_mean | 72.4         |
|    agent/time/fps                    | 4414         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0044840183 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.469       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00614     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.9         |
|    agent/rollout/ep_rew_wrapped_mean | 74.1         |
|    agent/time/fps                    | 4503         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0062575387 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.46        |
|    agent/train/explained_variance    | 0.841        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000996    |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15          |
|    agent/rollout/ep_rew_wrapped_mean | 75.7        |
|    agent/time/fps                    | 4287        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.004387371 |
|    agent/train/clip_fraction         | 0.154       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.447      |
|    agent/train/explained_variance    | 0.936       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0494      |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.5         |
|    agent/rollout/ep_rew_wrapped_mean | 76.9         |
|    agent/time/fps                    | 4258         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0046000034 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.848        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0404       |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14           |
|    agent/rollout/ep_rew_wrapped_mean | 78.3         |
|    agent/time/fps                    | 4481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0052611623 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.437       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00397      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.8        |
|    agent/rollout/ep_rew_wrapped_mean | 77.5        |
|    agent/time/fps                    | 4491        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.010898133 |
|    agent/train/clip_fraction         | 0.224       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.456      |
|    agent/train/explained_variance    | -0.87       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0302     |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.4        |
|    agent/rollout/ep_rew_wrapped_mean | 77.2        |
|    agent/time/fps                    | 4471        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.005271718 |
|    agent/train/clip_fraction         | 0.18        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.446      |
|    agent/train/explained_variance    | 0.447       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0143     |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13           |
|    agent/rollout/ep_rew_wrapped_mean | 77.4         |
|    agent/time/fps                    | 4260         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0051928414 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.44        |
|    agent/train/explained_variance    | 0.857        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0434       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.8         |
|    agent/rollout/ep_rew_wrapped_mean | 77.7         |
|    agent/time/fps                    | 4484         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0069878576 |
|    agent/train/clip_fraction         | 0.212        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.432       |
|    agent/train/explained_variance    | 0.915        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0136      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.2        |
|    agent/rollout/ep_rew_wrapped_mean | 78          |
|    agent/time/fps                    | 4398        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005974079 |
|    agent/train/clip_fraction         | 0.174       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.399      |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00235     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 11.9        |
|    agent/rollout/ep_rew_wrapped_mean | 78.6        |
|    agent/time/fps                    | 4448        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.006224677 |
|    agent/train/clip_fraction         | 0.223       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.397      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0245     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.4         |
|    agent/rollout/ep_rew_wrapped_mean | 79.2         |
|    agent/time/fps                    | 4301         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0057996027 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.387       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00105      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.1         |
|    agent/rollout/ep_rew_wrapped_mean | 79.7         |
|    agent/time/fps                    | 4509         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0053376267 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.36        |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00693     |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 11          |
|    agent/rollout/ep_rew_wrapped_mean | 80.9        |
|    agent/time/fps                    | 4492        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.004947974 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.34       |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0158     |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.8        |
|    agent/rollout/ep_rew_wrapped_mean | 81.9        |
|    agent/time/fps                    | 4505        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.005564766 |
|    agent/train/clip_fraction         | 0.185       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.342      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.000862    |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.8        |
|    agent/rollout/ep_rew_wrapped_mean | 83.5        |
|    agent/time/fps                    | 4495        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.004381695 |
|    agent/train/clip_fraction         | 0.169       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.347      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0105     |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10.6         |
|    agent/rollout/ep_rew_wrapped_mean | 84.8         |
|    agent/time/fps                    | 4404         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0073605645 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.32        |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000549    |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10.5         |
|    agent/rollout/ep_rew_wrapped_mean | 85.8         |
|    agent/time/fps                    | 4459         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0053770253 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.317       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00993      |
|    agent/train/n_updates             | 450          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▁▁▁▁
train/approx_kl,▂▂▃▄▃▄▃▃▂▅▅▂▂▃▂▂▄▂▃▃▄▄▂▄▄▃▂▂▂▂▃▂▅▃▂█▂▂▁▃
train/clip_fraction,█▆▅▆▄▄▃▃▂▂▃▃▂▂▂▂▂▂▂▂▁▂▂▁▂▂▁▂▂▁▁▁▂▁▂▂▂▂▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▄▅▆▆▇▇▇▇▇▇▇▇███▇████████▇▇███████▇███
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,9.79
time/fps,2873.0
train/approx_kl,0.00484
train/clip_fraction,0.05625
train/clip_range,0.1
train/entropy_loss,-0.07784
train/explained_variance,0.99894
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [08:20<12:31, 250.34s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168548612234493, max=1.0…

Query schedule: [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 10 fragments (2000 transitions)
Requested 1200 transitions but only 0 in buffer. Sampling 1200 additional transitions.
Sampling 800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 5 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 27.5     |
|    agent/rollout/ep_rew_wrapped_mean | -13.8    |
|    agent/time/fps                    | 4270     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 27.5     |
|    agent/rollout/ep_rew_wrapped_mean | -13.8    |
|    agent/time/fps                    | 4.27e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 22.8       |
|    agent/rollout/ep_rew_wrapped_mean | -0.241     |
|    agent/time/fps                    | 4308       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 4096       |
|    agent/train/approx_kl             | 0.00425779 |
|    agent/train/clip_fraction         | 0.169      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.69      |
|    agent/train/explained_variance    | 0.644      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0172    |
|    agent/train/n_updates             | 10         |
|    agent/train/policy_gradient_loss  | -0.0064  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.3        |
|    agent/rollout/ep_rew_wrapped_mean | 14.9        |
|    agent/time/fps                    | 4406        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.002429678 |
|    agent/train/clip_fraction         | 0.109       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.684      |
|    agent/train/explained_variance    | 0.249       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0737      |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | 11.7         |
|    agent/time/fps                    | 4337         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0023358082 |
|    agent/train/clip_fraction         | 0.0927       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.82         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0643       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.6        |
|    agent/rollout/ep_rew_wrapped_mean | -0.0766     |
|    agent/time/fps                    | 4374        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.004803277 |
|    agent/train/clip_fraction         | 0.346       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.674      |
|    agent/train/explained_variance    | 0.623       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.018       |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.5         |
|    agent/rollout/ep_rew_wrapped_mean | 0.636        |
|    agent/time/fps                    | 4417         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0032706035 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0181       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.6         |
|    agent/rollout/ep_rew_wrapped_mean | 3.54         |
|    agent/time/fps                    | 4254         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0027039903 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.645       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00784     |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.4         |
|    agent/rollout/ep_rew_wrapped_mean | 1.67         |
|    agent/time/fps                    | 4451         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0027815811 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.638       |
|    agent/train/explained_variance    | 0.867        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0139       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.4         |
|    agent/rollout/ep_rew_wrapped_mean | -1.8         |
|    agent/time/fps                    | 4449         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0037117205 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.629       |
|    agent/train/explained_variance    | 0.615        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0399       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.8         |
|    agent/rollout/ep_rew_wrapped_mean | -2.91        |
|    agent/time/fps                    | 4440         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0053708144 |
|    agent/train/clip_fraction         | 0.209        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.614       |
|    agent/train/explained_variance    | 0.907        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0023      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.1         |
|    agent/rollout/ep_rew_wrapped_mean | -3.78        |
|    agent/time/fps                    | 4476         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0047179647 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.602       |
|    agent/train/explained_variance    | 0.859        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00336      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 16.3        |
|    agent/rollout/ep_rew_wrapped_mean | -4.86       |
|    agent/time/fps                    | 4358        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.003741837 |
|    agent/train/clip_fraction         | 0.147       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.582      |
|    agent/train/explained_variance    | 0.921       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0146     |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.8         |
|    agent/rollout/ep_rew_wrapped_mean | -5.39        |
|    agent/time/fps                    | 4098         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0060585877 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0239      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | -5.98        |
|    agent/time/fps                    | 4394         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0027350103 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.551       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00934     |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.8        |
|    agent/rollout/ep_rew_wrapped_mean | -6.51       |
|    agent/time/fps                    | 3804        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.003057062 |
|    agent/train/clip_fraction         | 0.123       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.525      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00178     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.4        |
|    agent/rollout/ep_rew_wrapped_mean | -7.02       |
|    agent/time/fps                    | 2603        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.003704635 |
|    agent/train/clip_fraction         | 0.109       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.516      |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0248     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.1        |
|    agent/rollout/ep_rew_wrapped_mean | -7.72       |
|    agent/time/fps                    | 1858        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.003182559 |
|    agent/train/clip_fraction         | 0.112       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.524      |
|    agent/train/explained_variance    | 0.947       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00657    |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.8         |
|    agent/rollout/ep_rew_wrapped_mean | -8.06        |
|    agent/time/fps                    | 1433         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0052942587 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.498       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0115      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.5        |
|    agent/rollout/ep_rew_wrapped_mean | -8.32       |
|    agent/time/fps                    | 3769        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.005174638 |
|    agent/train/clip_fraction         | 0.157       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.467      |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0248     |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.3        |
|    agent/rollout/ep_rew_wrapped_mean | -8.47       |
|    agent/time/fps                    | 4107        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.005464806 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.44       |
|    agent/train/explained_variance    | 0.945       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0158     |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13           |
|    agent/rollout/ep_rew_wrapped_mean | -8.57        |
|    agent/time/fps                    | 3058         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0028119916 |
|    agent/train/clip_fraction         | 0.103        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.41        |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00304     |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.8        |
|    agent/rollout/ep_rew_wrapped_mean | -10.5       |
|    agent/time/fps                    | 4286        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.008762778 |
|    agent/train/clip_fraction         | 0.246       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.453      |
|    agent/train/explained_variance    | 0.966       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0181     |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.6        |
|    agent/rollout/ep_rew_wrapped_mean | -14.2       |
|    agent/time/fps                    | 4180        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.003751004 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.457      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0118     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.4         |
|    agent/rollout/ep_rew_wrapped_mean | -15.7        |
|    agent/time/fps                    | 4124         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0034580966 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.446       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0321      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.3        |
|    agent/rollout/ep_rew_wrapped_mean | -14.8       |
|    agent/time/fps                    | 4232        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.002950305 |
|    agent/train/clip_fraction         | 0.151       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.446      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000803   |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.5         |
|    agent/rollout/ep_rew_wrapped_mean | -16.4        |
|    agent/time/fps                    | 4144         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0037074282 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.425       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00442     |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 11.1        |
|    agent/rollout/ep_rew_wrapped_mean | -18.8       |
|    agent/time/fps                    | 4419        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.004772613 |
|    agent/train/clip_fraction         | 0.142       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.374      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0215     |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.4        |
|    agent/rollout/ep_rew_wrapped_mean | -19.6       |
|    agent/time/fps                    | 3424        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.005507822 |
|    agent/train/clip_fraction         | 0.123       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.309      |
|    agent/train/explained_variance    | 0.805       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00839    |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.27         |
|    agent/rollout/ep_rew_wrapped_mean | -19.7        |
|    agent/time/fps                    | 4199         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0029059905 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.287       |
|    agent/train/explained_variance    | 0.782        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00104     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.05        |
|    agent/rollout/ep_rew_wrapped_mean | -20.7       |
|    agent/time/fps                    | 1857        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.011210933 |
|    agent/train/clip_fraction         | 0.151       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.321      |
|    agent/train/explained_variance    | 0.833       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.033      |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.91        |
|    agent/rollout/ep_rew_wrapped_mean | -21.6       |
|    agent/time/fps                    | 806         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.010745978 |
|    agent/train/clip_fraction         | 0.216       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.339      |
|    agent/train/explained_variance    | 0.944       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0293     |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.88        |
|    agent/rollout/ep_rew_wrapped_mean | -23.1       |
|    agent/time/fps                    | 2418        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.009755615 |
|    agent/train/clip_fraction         | 0.153       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.374      |
|    agent/train/explained_variance    | 0.358       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00571    |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.77        |
|    agent/rollout/ep_rew_wrapped_mean | -24.9       |
|    agent/time/fps                    | 1774        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.004697615 |
|    agent/train/clip_fraction         | 0.161       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.387      |
|    agent/train/explained_variance    | 0.738       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0116     |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.68        |
|    agent/rollout/ep_rew_wrapped_mean | -26.4       |
|    agent/time/fps                    | 3268        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.009273861 |
|    agent/train/clip_fraction         | 0.17        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.355      |
|    agent/train/explained_variance    | 0.828       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00372     |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.49        |
|    agent/rollout/ep_rew_wrapped_mean | -27.8       |
|    agent/time/fps                    | 3752        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.009630324 |
|    agent/train/clip_fraction         | 0.189       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.328      |
|    agent/train/explained_variance    | 0.888       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0214     |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.4         |
|    agent/rollout/ep_rew_wrapped_mean | -29.2       |
|    agent/time/fps                    | 3827        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.007059751 |
|    agent/train/clip_fraction         | 0.18        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.323      |
|    agent/train/explained_variance    | 0.915       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0112     |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 8.41       |
|    agent/rollout/ep_rew_wrapped_mean | -30.5      |
|    agent/time/fps                    | 3413       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 75776      |
|    agent/train/approx_kl             | 0.00528975 |
|    agent/train/clip_fraction         | 0.153      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.321     |
|    agent/train/explained_variance    | 0.931      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.000508  |
|    agent/train/n_updates             | 360        |
|    agent/train/policy_gradient_loss  | -0.00547 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.4         |
|    agent/rollout/ep_rew_wrapped_mean | -32.2       |
|    agent/time/fps                    | 4187        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005443808 |
|    agent/train/clip_fraction         | 0.109       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.299      |
|    agent/train/explained_variance    | 0.892       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00083    |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.41        |
|    agent/rollout/ep_rew_wrapped_mean | -34.3       |
|    agent/time/fps                    | 3885        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.006152567 |
|    agent/train/clip_fraction         | 0.123       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.298      |
|    agent/train/explained_variance    | 0.937       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.023      |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.49        |
|    agent/rollout/ep_rew_wrapped_mean | -36.5       |
|    agent/time/fps                    | 4129        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.008692031 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.307      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0103     |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.64         |
|    agent/rollout/ep_rew_wrapped_mean | -38.3        |
|    agent/time/fps                    | 3432         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0070639215 |
|    agent/train/clip_fraction         | 0.22         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.322       |
|    agent/train/explained_variance    | 0.81         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00957     |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.67         |
|    agent/rollout/ep_rew_wrapped_mean | -39.9        |
|    agent/time/fps                    | 4108         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0070760194 |
|    agent/train/clip_fraction         | 0.169        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.308       |
|    agent/train/explained_variance    | 0.802        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0211      |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.74         |
|    agent/rollout/ep_rew_wrapped_mean | -41.3        |
|    agent/time/fps                    | 4154         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0061310134 |
|    agent/train/clip_fraction         | 0.204        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.317       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0165      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.14        |
|    agent/rollout/ep_rew_wrapped_mean | -42.8       |
|    agent/time/fps                    | 4118        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.010121426 |
|    agent/train/clip_fraction         | 0.224       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.33       |
|    agent/train/explained_variance    | 0.919       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0287     |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.52         |
|    agent/rollout/ep_rew_wrapped_mean | -44          |
|    agent/time/fps                    | 3640         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0031309237 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.399       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000917     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.85        |
|    agent/rollout/ep_rew_wrapped_mean | -45.4       |
|    agent/time/fps                    | 1746        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.004548991 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.428      |
|    agent/train/explained_variance    | 0.869       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0182     |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.161 MB of 0.161 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▄▄▅▆▇██▇▆
time/fps,▁▁▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
train/approx_kl,▂▂▂▂▂▁▁▁▅▂▂▃▄▂▁▂▅▃▃▄▂▂▂▂▁▂▁▁▃▃▆▂▇▄▃▃▃▆▃█
train/clip_fraction,▂▃▄▄▂▂▃▂▂▂▂▁▂▁▁▁▂▂▁▂▁▂▁▂▁▁▁▁▇▄▇█▇▇▆▆▇▇▆▆
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▅▆▆▆▇▇▇▇█████▇██▇█▇█████▇▄▅▃▃▃▃▃▃▄▃▃▃
train/explained_variance,▁▇▇▇█████████████████████████████▇▇▇▇█▇█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,288.89999
time/fps,2759.0
train/approx_kl,0.01477
train/clip_fraction,0.28618
train/clip_range,0.1
train/entropy_loss,-0.4945
train/explained_variance,0.98271
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [12:57<08:45, 262.57s/it]

Query schedule: [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 10 fragments (2000 transitions)
Requested 1200 transitions but only 0 in buffer. Sampling 1200 additional transitions.
Sampling 800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 5 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 37.8     |
|    agent/rollout/ep_rew_wrapped_mean | 363      |
|    agent/time/fps                    | 4325     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 37.8     |
|    agent/rollout/ep_rew_wrapped_mean | 363      |
|    agent/time/fps                    | 4.32e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 45.6         |
|    agent/rollout/ep_rew_wrapped_mean | 160          |
|    agent/time/fps                    | 4042         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0017781317 |
|    agent/train/clip_fraction         | 0.0471       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | -0.488       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0123       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.5         |
|    agent/rollout/ep_rew_wrapped_mean | 96.4         |
|    agent/time/fps                    | 4239         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0024447432 |
|    agent/train/clip_fraction         | 0.0754       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.65         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00172      |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.9         |
|    agent/rollout/ep_rew_wrapped_mean | 62.6         |
|    agent/time/fps                    | 4356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0019687198 |
|    agent/train/clip_fraction         | 0.0928       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.756        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0103      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.2        |
|    agent/rollout/ep_rew_wrapped_mean | 32.5        |
|    agent/time/fps                    | 4394        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.003888266 |
|    agent/train/clip_fraction         | 0.175       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.679      |
|    agent/train/explained_variance    | -1.32       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0478      |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.2         |
|    agent/rollout/ep_rew_wrapped_mean | 8.66         |
|    agent/time/fps                    | 4410         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0013477943 |
|    agent/train/clip_fraction         | 0.0696       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.649        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.116        |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.5        |
|    agent/rollout/ep_rew_wrapped_mean | -7.74       |
|    agent/time/fps                    | 4398        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.002458446 |
|    agent/train/clip_fraction         | 0.104       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.667      |
|    agent/train/explained_variance    | 0.919       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0138      |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.5        |
|    agent/rollout/ep_rew_wrapped_mean | -32.3       |
|    agent/time/fps                    | 4380        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.003961237 |
|    agent/train/clip_fraction         | 0.187       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.669      |
|    agent/train/explained_variance    | 0.868       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0947      |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.1         |
|    agent/rollout/ep_rew_wrapped_mean | -50.7        |
|    agent/time/fps                    | 4415         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0018379427 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0547       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.2        |
|    agent/rollout/ep_rew_wrapped_mean | -59.5       |
|    agent/time/fps                    | 4367        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.003644924 |
|    agent/train/clip_fraction         | 0.202       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.684      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0167      |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.9         |
|    agent/rollout/ep_rew_wrapped_mean | -64.1        |
|    agent/time/fps                    | 4452         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0045476495 |
|    agent/train/clip_fraction         | 0.373        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0138       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.4         |
|    agent/rollout/ep_rew_wrapped_mean | -66.3        |
|    agent/time/fps                    | 4383         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0031276396 |
|    agent/train/clip_fraction         | 0.213        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00925     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.7         |
|    agent/rollout/ep_rew_wrapped_mean | -66.2        |
|    agent/time/fps                    | 4387         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0034162574 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.969        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.054        |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.7        |
|    agent/rollout/ep_rew_wrapped_mean | -64.8       |
|    agent/time/fps                    | 4485        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.004188045 |
|    agent/train/clip_fraction         | 0.194       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.673      |
|    agent/train/explained_variance    | 0.908       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0187      |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | -62.7        |
|    agent/time/fps                    | 3781         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0030278177 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.21         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0148       |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.3         |
|    agent/rollout/ep_rew_wrapped_mean | -61.4        |
|    agent/time/fps                    | 4366         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0021853624 |
|    agent/train/clip_fraction         | 0.0946       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.609        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00588      |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.5         |
|    agent/rollout/ep_rew_wrapped_mean | -60.1        |
|    agent/time/fps                    | 4403         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0030702723 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.84         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00469     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.6        |
|    agent/rollout/ep_rew_wrapped_mean | -58.5       |
|    agent/time/fps                    | 4463        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.004098367 |
|    agent/train/clip_fraction         | 0.229       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.651      |
|    agent/train/explained_variance    | 0.883       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00763     |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28           |
|    agent/rollout/ep_rew_wrapped_mean | -57.5        |
|    agent/time/fps                    | 4434         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0026738073 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.857        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00832      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | -56.2        |
|    agent/time/fps                    | 4354         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0036824811 |
|    agent/train/clip_fraction         | 0.238        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.646       |
|    agent/train/explained_variance    | 0.916        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00855      |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.7         |
|    agent/rollout/ep_rew_wrapped_mean | -77.3        |
|    agent/time/fps                    | 4497         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0042409655 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.617       |
|    agent/train/explained_variance    | 0.762        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0117       |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28           |
|    agent/rollout/ep_rew_wrapped_mean | -77.6        |
|    agent/time/fps                    | 3973         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0032160813 |
|    agent/train/clip_fraction         | 0.201        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.611       |
|    agent/train/explained_variance    | 0.895        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.031        |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.7         |
|    agent/rollout/ep_rew_wrapped_mean | -78.7        |
|    agent/time/fps                    | 1512         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0043643685 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.585       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0205       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.7         |
|    agent/rollout/ep_rew_wrapped_mean | -79.6        |
|    agent/time/fps                    | 3037         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0020792345 |
|    agent/train/clip_fraction         | 0.0873       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.583       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0161       |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.5         |
|    agent/rollout/ep_rew_wrapped_mean | -77.2        |
|    agent/time/fps                    | 3899         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0024423122 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.573       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00573     |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -73.4        |
|    agent/time/fps                    | 3634         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0033995104 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.574       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0123      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.1         |
|    agent/rollout/ep_rew_wrapped_mean | -69.8        |
|    agent/time/fps                    | 3945         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0033908766 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.558       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00193      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.4         |
|    agent/rollout/ep_rew_wrapped_mean | -61.8        |
|    agent/time/fps                    | 4358         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0050362227 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00758     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | -51.5        |
|    agent/time/fps                    | 4482         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0032642658 |
|    agent/train/clip_fraction         | 0.0902       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.567       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0323       |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | -43.6        |
|    agent/time/fps                    | 4324         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0020735082 |
|    agent/train/clip_fraction         | 0.0769       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.559       |
|    agent/train/explained_variance    | 0.866        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0198       |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.9         |
|    agent/rollout/ep_rew_wrapped_mean | -36.8        |
|    agent/time/fps                    | 4414         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0041976385 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.534       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00375     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.5        |
|    agent/rollout/ep_rew_wrapped_mean | -30.9       |
|    agent/time/fps                    | 4055        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.002610249 |
|    agent/train/clip_fraction         | 0.13        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.533      |
|    agent/train/explained_variance    | 0.935       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0143      |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33           |
|    agent/rollout/ep_rew_wrapped_mean | -26.9        |
|    agent/time/fps                    | 3993         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0018813803 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.557       |
|    agent/train/explained_variance    | 0.91         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00781      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | -23.1        |
|    agent/time/fps                    | 4398         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0053755576 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.544       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00676     |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.9        |
|    agent/rollout/ep_rew_wrapped_mean | -20.4       |
|    agent/time/fps                    | 4135        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.002870842 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.528      |
|    agent/train/explained_variance    | 0.92        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0125     |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.5        |
|    agent/rollout/ep_rew_wrapped_mean | -17.6       |
|    agent/time/fps                    | 4472        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.009360782 |
|    agent/train/clip_fraction         | 0.237       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.564      |
|    agent/train/explained_variance    | 0.956       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00106    |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.7        |
|    agent/rollout/ep_rew_wrapped_mean | -15.2       |
|    agent/time/fps                    | 4522        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 75776       |
|    agent/train/approx_kl             | 0.004909952 |
|    agent/train/clip_fraction         | 0.205       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.58       |
|    agent/train/explained_variance    | 0.904       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00754    |
|    agent/train/n_updates             | 360         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.6         |
|    agent/rollout/ep_rew_wrapped_mean | -13.3        |
|    agent/time/fps                    | 3992         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0035855165 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.555       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0113      |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.5         |
|    agent/rollout/ep_rew_wrapped_mean | -10.9        |
|    agent/time/fps                    | 3900         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0042317593 |
|    agent/train/clip_fraction         | 0.217        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.582       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00597     |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 44.3         |
|    agent/rollout/ep_rew_wrapped_mean | -9.26        |
|    agent/time/fps                    | 4485         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0035232957 |
|    agent/train/clip_fraction         | 0.2          |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.582       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0093      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 48.5         |
|    agent/rollout/ep_rew_wrapped_mean | -5.44        |
|    agent/time/fps                    | 3830         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0046233525 |
|    agent/train/clip_fraction         | 0.221        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00545      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 53.9         |
|    agent/rollout/ep_rew_wrapped_mean | -2.02        |
|    agent/time/fps                    | 4489         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0038992944 |
|    agent/train/clip_fraction         | 0.205        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.614       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00338     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 58.9         |
|    agent/rollout/ep_rew_wrapped_mean | 1.08         |
|    agent/time/fps                    | 4505         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0036277603 |
|    agent/train/clip_fraction         | 0.244        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.597       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00555     |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 63.7         |
|    agent/rollout/ep_rew_wrapped_mean | 4.53         |
|    agent/time/fps                    | 4504         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0034358264 |
|    agent/train/clip_fraction         | 0.251        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.61        |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0156       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 70.2        |
|    agent/rollout/ep_rew_wrapped_mean | 6.62        |
|    agent/time/fps                    | 4360        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.005141681 |
|    agent/train/clip_fraction         | 0.25        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.599      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00244     |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 80.7        |
|    agent/rollout/ep_rew_wrapped_mean | 9.08        |
|    agent/time/fps                    | 4198        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.005427778 |
|    agent/train/clip_fraction         | 0.339       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.577      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0284     |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▅▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▃▁▂▁▂▂▂▁▁▁▁▁▁▂▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▂▃▃▁▁▁▁▂▂▂▂▄▄▂▃▂▁▃▃▂▂▅▃▅▂▃▃▃▃▃▃▃▁▂█▆▂▂▄▂
train/clip_fraction,▅█▄▅▃▃▂▃▃▂▂▃▃▂▂▂▂▃▂▁▁▄▂▃▂▂▂▁▂▂▁▁▁▁▃▂▂▁▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▄▅▆▆▇▇▇▇▇▇▇██▇▇███▇▇▇▇█████████▇█████
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,8.45
time/fps,2874.0
train/approx_kl,0.00478
train/clip_fraction,0.06665
train/clip_range,0.1
train/entropy_loss,-0.12767
train/explained_variance,0.99922
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [17:13<04:19, 259.97s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168246755066018, max=1.0…

Query schedule: [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Collecting 10 fragments (2000 transitions)
Requested 1200 transitions but only 0 in buffer. Sampling 1200 additional transitions.
Sampling 800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 5 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.8     |
|    agent/rollout/ep_rew_wrapped_mean | 65.5     |
|    agent/time/fps                    | 4434     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.8     |
|    agent/rollout/ep_rew_wrapped_mean | 65.5     |
|    agent/time/fps                    | 4.43e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.9         |
|    agent/rollout/ep_rew_wrapped_mean | -12.1        |
|    agent/time/fps                    | 4389         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0040380294 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | -0.19        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0233       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.4         |
|    agent/rollout/ep_rew_wrapped_mean | -75          |
|    agent/time/fps                    | 4425         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0026319087 |
|    agent/train/clip_fraction         | 0.0813       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | -0.634       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.247        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.9         |
|    agent/rollout/ep_rew_wrapped_mean | -122         |
|    agent/time/fps                    | 4429         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0011148024 |
|    agent/train/clip_fraction         | 0.0137       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.619        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.21         |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.9         |
|    agent/rollout/ep_rew_wrapped_mean | -137         |
|    agent/time/fps                    | 4283         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0033820786 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.919        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0904       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.9         |
|    agent/rollout/ep_rew_wrapped_mean | -132         |
|    agent/time/fps                    | 3337         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0033207415 |
|    agent/train/clip_fraction         | 0.0764       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.688       |
|    agent/train/explained_variance    | 0.895        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0568       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.6        |
|    agent/rollout/ep_rew_wrapped_mean | -130        |
|    agent/time/fps                    | 4420        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.003541926 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.686      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0471      |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.9         |
|    agent/rollout/ep_rew_wrapped_mean | -127         |
|    agent/time/fps                    | 4369         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0017399248 |
|    agent/train/clip_fraction         | 0.0563       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.868        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.052        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 35.2       |
|    agent/rollout/ep_rew_wrapped_mean | -118       |
|    agent/time/fps                    | 4179       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 18432      |
|    agent/train/approx_kl             | 0.00325342 |
|    agent/train/clip_fraction         | 0.156      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.681     |
|    agent/train/explained_variance    | 0.479      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.017      |
|    agent/train/n_updates             | 80         |
|    agent/train/policy_gradient_loss  | -0.00458 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.8         |
|    agent/rollout/ep_rew_wrapped_mean | -111         |
|    agent/time/fps                    | 4297         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0019941283 |
|    agent/train/clip_fraction         | 0.0948       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.928        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00941      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36.3        |
|    agent/rollout/ep_rew_wrapped_mean | -104        |
|    agent/time/fps                    | 4263        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.002863322 |
|    agent/train/clip_fraction         | 0.129       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.687      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00212     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.8         |
|    agent/rollout/ep_rew_wrapped_mean | -99.4        |
|    agent/time/fps                    | 4435         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0028832024 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00829     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37           |
|    agent/rollout/ep_rew_wrapped_mean | -94.7        |
|    agent/time/fps                    | 2569         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0030543674 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.682       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00345      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.2         |
|    agent/rollout/ep_rew_wrapped_mean | -90.7        |
|    agent/time/fps                    | 4356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0036019457 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.883        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0171      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.4         |
|    agent/rollout/ep_rew_wrapped_mean | -87.4        |
|    agent/time/fps                    | 4077         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0034778125 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00177     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.3        |
|    agent/rollout/ep_rew_wrapped_mean | -84.3       |
|    agent/time/fps                    | 3490        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.003725281 |
|    agent/train/clip_fraction         | 0.197       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.674      |
|    agent/train/explained_variance    | 0.991       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0236     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.8         |
|    agent/rollout/ep_rew_wrapped_mean | -81          |
|    agent/time/fps                    | 4327         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0036780247 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.665       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0196      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.9         |
|    agent/rollout/ep_rew_wrapped_mean | -77.9        |
|    agent/time/fps                    | 4256         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0038765701 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.649       |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0192      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.8         |
|    agent/rollout/ep_rew_wrapped_mean | -74.5        |
|    agent/time/fps                    | 3874         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0031867176 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.797        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00667     |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.2         |
|    agent/rollout/ep_rew_wrapped_mean | -71.9        |
|    agent/time/fps                    | 4234         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0043082056 |
|    agent/train/clip_fraction         | 0.27         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00331     |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
----------------------------------------------------
| raw/                                 |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 41.8      |
|    agent/rollout/ep_rew_wrapped_mean | -76.3     |
|    agent/time/fps                    | 4010      |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_timesteps        | 43008     |
|    agent/train/approx_kl             | 0.0039007 |
|    agent/train/clip_fraction         | 0.239     |
|    agent/train/clip_range            | 0.1       |
|    agent/train/entropy_loss          | -0.64     |
|    agent/train/explained_variance    | 0.938     |
|    agent/train/learning_rate         | 0.002     |
|    agent/train/loss                  | -0.0275   |
|    agent/train/n_updates             | 200       |
|    agent/train/policy_gradient_loss  | -0.00794  |
|    agent/tra

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 42.2       |
|    agent/rollout/ep_rew_wrapped_mean | -72.3      |
|    agent/time/fps                    | 4405       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 45056      |
|    agent/train/approx_kl             | 0.00296179 |
|    agent/train/clip_fraction         | 0.148      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.636     |
|    agent/train/explained_variance    | 0.886      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00389   |
|    agent/train/n_updates             | 210        |
|    agent/train/policy_gradient_loss  | -0.00503 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 42.6         |
|    agent/rollout/ep_rew_wrapped_mean | -62.1        |
|    agent/time/fps                    | 3811         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0039512594 |
|    agent/train/clip_fraction         | 0.247        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.634       |
|    agent/train/explained_variance    | 0.9          |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00974     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.9         |
|    agent/rollout/ep_rew_wrapped_mean | -48.9        |
|    agent/time/fps                    | 4408         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0027966402 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.627       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00143      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.7         |
|    agent/rollout/ep_rew_wrapped_mean | -39.3        |
|    agent/time/fps                    | 4165         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0022467838 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.618       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00816     |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.3         |
|    agent/rollout/ep_rew_wrapped_mean | -33.5        |
|    agent/time/fps                    | 3367         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0031608203 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.628       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00328      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.1         |
|    agent/rollout/ep_rew_wrapped_mean | -28.1        |
|    agent/time/fps                    | 4204         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0027945905 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.625       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.003        |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 41          |
|    agent/rollout/ep_rew_wrapped_mean | -23.4       |
|    agent/time/fps                    | 4152        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.003344161 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.616      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.000267    |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.4         |
|    agent/rollout/ep_rew_wrapped_mean | -20.1        |
|    agent/time/fps                    | 4482         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0022740993 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.599       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000646    |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 39.7        |
|    agent/rollout/ep_rew_wrapped_mean | -16.5       |
|    agent/time/fps                    | 4437        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.004395831 |
|    agent/train/clip_fraction         | 0.179       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.59       |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00735    |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.1         |
|    agent/rollout/ep_rew_wrapped_mean | -14.6        |
|    agent/time/fps                    | 4506         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0023692406 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.589       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0205       |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 39.3       |
|    agent/rollout/ep_rew_wrapped_mean | -12.6      |
|    agent/time/fps                    | 4240       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 65536      |
|    agent/train/approx_kl             | 0.00224567 |
|    agent/train/clip_fraction         | 0.143      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.604     |
|    agent/train/explained_variance    | 0.955      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0193     |
|    agent/train/n_updates             | 310        |
|    agent/train/policy_gradient_loss  | -0.00279 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.8         |
|    agent/rollout/ep_rew_wrapped_mean | -10.1        |
|    agent/time/fps                    | 4165         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0027730793 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.577       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0228       |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.1         |
|    agent/rollout/ep_rew_wrapped_mean | -6.05        |
|    agent/time/fps                    | 3689         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0025013727 |
|    agent/train/clip_fraction         | 0.1          |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.535       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00827     |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39           |
|    agent/rollout/ep_rew_wrapped_mean | -2.56        |
|    agent/time/fps                    | 4264         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0029172823 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.55        |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0012       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.2        |
|    agent/rollout/ep_rew_wrapped_mean | 0.309       |
|    agent/time/fps                    | 4200        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.002778193 |
|    agent/train/clip_fraction         | 0.13        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.536      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0157     |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.1         |
|    agent/rollout/ep_rew_wrapped_mean | 2.75         |
|    agent/time/fps                    | 4488         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0046404637 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.502       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0196      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36.7        |
|    agent/rollout/ep_rew_wrapped_mean | 5.36        |
|    agent/time/fps                    | 4457        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005752553 |
|    agent/train/clip_fraction         | 0.205       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.472      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00448    |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36.2        |
|    agent/rollout/ep_rew_wrapped_mean | 5.85        |
|    agent/time/fps                    | 3266        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.003868531 |
|    agent/train/clip_fraction         | 0.174       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.535      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0216     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.5         |
|    agent/rollout/ep_rew_wrapped_mean | 6.63         |
|    agent/time/fps                    | 4304         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0035360726 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.527       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0291       |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.7        |
|    agent/rollout/ep_rew_wrapped_mean | 7.65        |
|    agent/time/fps                    | 4117        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.003027821 |
|    agent/train/clip_fraction         | 0.157       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.57       |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00108    |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.7         |
|    agent/rollout/ep_rew_wrapped_mean | 5.61         |
|    agent/time/fps                    | 3369         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0039299326 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.552       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00583      |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.6        |
|    agent/rollout/ep_rew_wrapped_mean | 1.83        |
|    agent/time/fps                    | 4100        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.005698686 |
|    agent/train/clip_fraction         | 0.185       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.577      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0126      |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | -1.07        |
|    agent/time/fps                    | 4089         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0034934215 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.572       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0274       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.2        |
|    agent/rollout/ep_rew_wrapped_mean | -2.15       |
|    agent/time/fps                    | 4451        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.004865283 |
|    agent/train/clip_fraction         | 0.187       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.53       |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0123      |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28          |
|    agent/rollout/ep_rew_wrapped_mean | -4.5        |
|    agent/time/fps                    | 4227        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.003869823 |
|    agent/train/clip_fraction         | 0.198       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.543      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.39        |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▃▂▁▂▁▂▃▃▃▃▂▂▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
time/fps,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▂▂▃▂▁▂▂▂▄▄▂▂▅▄▃█▄▄▃▄▃▃▂▅▃▃▅▃▄▆▅▅▃▃▃▄▃▅▅
train/clip_fraction,▁█▄▆▄▄▃▆▂▅▃▅▅▂▃▁█▅▄▂▆▄▂▄▆▃▄▅▅▅█▅▄▄▂▂▆▃▄▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▅▅▆▆▆█▆▇▅▇█▇█▆▇▇█▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇█▇
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,74.66
time/fps,2839.0
train/approx_kl,0.00821
train/clip_fraction,0.13037
train/clip_range,0.1
train/entropy_loss,-0.31326
train/explained_variance,0.99989
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [21:30<00:00, 258.02s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011169081944454875, max=1.0…

Query schedule: [10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Collecting 20 fragments (4000 transitions)
Requested 2400 transitions but only 0 in buffer. Sampling 2400 additional transitions.
Sampling 1600 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 10 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 53.8     |
|    agent/rollout/ep_rew_wrapped_mean | 91.7     |
|    agent/time/fps                    | 4535     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 53.8     |
|    agent/rollout/ep_rew_wrapped_mean | 91.7     |
|    agent/time/fps                    | 4.54e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 49.9         |
|    agent/rollout/ep_rew_wrapped_mean | 54.5         |
|    agent/time/fps                    | 4193         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0023356574 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.271        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0142      |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 45.3        |
|    agent/rollout/ep_rew_wrapped_mean | 30.9        |
|    agent/time/fps                    | 4480        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.002198691 |
|    agent/train/clip_fraction         | 0.039       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | 0.433       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000469   |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.9         |
|    agent/rollout/ep_rew_wrapped_mean | 12.6         |
|    agent/time/fps                    | 4432         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0029995935 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.748        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0219       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.5        |
|    agent/rollout/ep_rew_wrapped_mean | 1.69        |
|    agent/time/fps                    | 4382        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.002402438 |
|    agent/train/clip_fraction         | 0.0871      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.687      |
|    agent/train/explained_variance    | 0.439       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00678    |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.8        |
|    agent/rollout/ep_rew_wrapped_mean | -6.38       |
|    agent/time/fps                    | 4488        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.002455973 |
|    agent/train/clip_fraction         | 0.153       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.681      |
|    agent/train/explained_variance    | 0.862       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00204    |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.3         |
|    agent/rollout/ep_rew_wrapped_mean | -11.4        |
|    agent/time/fps                    | 4521         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0028490752 |
|    agent/train/clip_fraction         | 0.193        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.745        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00826      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37          |
|    agent/rollout/ep_rew_wrapped_mean | -16.4       |
|    agent/time/fps                    | 4517        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.004043623 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.675      |
|    agent/train/explained_variance    | 0.451       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0178      |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.9         |
|    agent/rollout/ep_rew_wrapped_mean | -29          |
|    agent/time/fps                    | 4521         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0033029977 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.678       |
|    agent/train/explained_variance    | 0.36         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0549       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.3         |
|    agent/rollout/ep_rew_wrapped_mean | -40.6        |
|    agent/time/fps                    | 4429         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0020003251 |
|    agent/train/clip_fraction         | 0.0384       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.812        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.098        |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.6         |
|    agent/rollout/ep_rew_wrapped_mean | -50          |
|    agent/time/fps                    | 4524         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0028119322 |
|    agent/train/clip_fraction         | 0.0827       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0233       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.5         |
|    agent/rollout/ep_rew_wrapped_mean | -55.4        |
|    agent/time/fps                    | 4459         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0037222912 |
|    agent/train/clip_fraction         | 0.231        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0192       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | -60.3        |
|    agent/time/fps                    | 4480         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0047798045 |
|    agent/train/clip_fraction         | 0.318        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.656       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0114       |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.9        |
|    agent/rollout/ep_rew_wrapped_mean | -63         |
|    agent/time/fps                    | 4464        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.004083956 |
|    agent/train/clip_fraction         | 0.272       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.654      |
|    agent/train/explained_variance    | 0.869       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0412      |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | -65.8        |
|    agent/time/fps                    | 4133         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0033502379 |
|    agent/train/clip_fraction         | 0.204        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.635        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.033        |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.1         |
|    agent/rollout/ep_rew_wrapped_mean | -71.7        |
|    agent/time/fps                    | 4197         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0037766444 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.348        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0304       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | -77.4        |
|    agent/time/fps                    | 4326         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0022915888 |
|    agent/train/clip_fraction         | 0.0827       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.617        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.31         |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | -83.8        |
|    agent/time/fps                    | 4487         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0025341269 |
|    agent/train/clip_fraction         | 0.0846       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.895        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0479       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.2        |
|    agent/rollout/ep_rew_wrapped_mean | -88.4       |
|    agent/time/fps                    | 4423        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003410218 |
|    agent/train/clip_fraction         | 0.151       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.658      |
|    agent/train/explained_variance    | 0.951       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0739      |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.4        |
|    agent/rollout/ep_rew_wrapped_mean | -97.5       |
|    agent/time/fps                    | 3880        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.005227139 |
|    agent/train/clip_fraction         | 0.3         |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.643      |
|    agent/train/explained_variance    | 0.972       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0236      |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | -110         |
|    agent/time/fps                    | 4088         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0046904827 |
|    agent/train/clip_fraction         | 0.225        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.611       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0112       |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.8         |
|    agent/rollout/ep_rew_wrapped_mean | -116         |
|    agent/time/fps                    | 3698         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0058570504 |
|    agent/train/clip_fraction         | 0.198        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.565       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0266       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30           |
|    agent/rollout/ep_rew_wrapped_mean | -122         |
|    agent/time/fps                    | 4294         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0040868884 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.515       |
|    agent/train/explained_variance    | 0.547        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0297       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.7        |
|    agent/rollout/ep_rew_wrapped_mean | -128        |
|    agent/time/fps                    | 4302        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.004831111 |
|    agent/train/clip_fraction         | 0.228       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.523      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0187      |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.1        |
|    agent/rollout/ep_rew_wrapped_mean | -140        |
|    agent/time/fps                    | 3506        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.004032204 |
|    agent/train/clip_fraction         | 0.117       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.568      |
|    agent/train/explained_variance    | 0.921       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.113       |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 27.7       |
|    agent/rollout/ep_rew_wrapped_mean | -153       |
|    agent/time/fps                    | 4491       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 53248      |
|    agent/train/approx_kl             | 0.00474785 |
|    agent/train/clip_fraction         | 0.126      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.548     |
|    agent/train/explained_variance    | 0.965      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0949     |
|    agent/train/n_updates             | 250        |
|    agent/train/policy_gradient_loss  | -0.00277 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.5        |
|    agent/rollout/ep_rew_wrapped_mean | -169        |
|    agent/time/fps                    | 4382        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.002748787 |
|    agent/train/clip_fraction         | 0.0718      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.56       |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.303       |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.7        |
|    agent/rollout/ep_rew_wrapped_mean | -187        |
|    agent/time/fps                    | 4479        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.004694392 |
|    agent/train/clip_fraction         | 0.146       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.553      |
|    agent/train/explained_variance    | 0.997       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0546      |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | -199         |
|    agent/time/fps                    | 4372         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0014640683 |
|    agent/train/clip_fraction         | 0.052        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.567       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0168       |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.2         |
|    agent/rollout/ep_rew_wrapped_mean | -206         |
|    agent/time/fps                    | 4080         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0015279537 |
|    agent/train/clip_fraction         | 0.0405       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0915       |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.7         |
|    agent/rollout/ep_rew_wrapped_mean | -210         |
|    agent/time/fps                    | 4431         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0026463375 |
|    agent/train/clip_fraction         | 0.097        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.578       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0969       |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.1         |
|    agent/rollout/ep_rew_wrapped_mean | -215         |
|    agent/time/fps                    | 4029         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0029822942 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.551       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0593       |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.2         |
|    agent/rollout/ep_rew_wrapped_mean | -216         |
|    agent/time/fps                    | 3977         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0029388329 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.588       |
|    agent/train/explained_variance    | 0.91         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.113        |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.6        |
|    agent/rollout/ep_rew_wrapped_mean | -217        |
|    agent/time/fps                    | 4411        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.004776692 |
|    agent/train/clip_fraction         | 0.202       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.602      |
|    agent/train/explained_variance    | 0.892       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0628      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.4         |
|    agent/rollout/ep_rew_wrapped_mean | -217         |
|    agent/time/fps                    | 4510         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0027004504 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.621       |
|    agent/train/explained_variance    | 0.654        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.15         |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.2        |
|    agent/rollout/ep_rew_wrapped_mean | -214        |
|    agent/time/fps                    | 4363        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.005436478 |
|    agent/train/clip_fraction         | 0.248       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.627      |
|    agent/train/explained_variance    | 0.737       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.095       |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.7        |
|    agent/rollout/ep_rew_wrapped_mean | -210        |
|    agent/time/fps                    | 3962        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 75776       |
|    agent/train/approx_kl             | 0.002460146 |
|    agent/train/clip_fraction         | 0.104       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.628      |
|    agent/train/explained_variance    | 0.569       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.126       |
|    agent/train/n_updates             | 360         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.3        |
|    agent/rollout/ep_rew_wrapped_mean | -204        |
|    agent/time/fps                    | 4371        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.004118406 |
|    agent/train/clip_fraction         | 0.209       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.638      |
|    agent/train/explained_variance    | 0.842       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0192      |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.9         |
|    agent/rollout/ep_rew_wrapped_mean | -199         |
|    agent/time/fps                    | 4475         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0038465576 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.627       |
|    agent/train/explained_variance    | 0.846        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0858       |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 27.2       |
|    agent/rollout/ep_rew_wrapped_mean | -194       |
|    agent/time/fps                    | 4356       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 81920      |
|    agent/train/approx_kl             | 0.00340539 |
|    agent/train/clip_fraction         | 0.174      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.632     |
|    agent/train/explained_variance    | 0.889      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0433     |
|    agent/train/n_updates             | 390        |
|    agent/train/policy_gradient_loss  | -0.00424 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | -190         |
|    agent/time/fps                    | 4154         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0036314414 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.619       |
|    agent/train/explained_variance    | 0.931        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.051        |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.7         |
|    agent/rollout/ep_rew_wrapped_mean | -187         |
|    agent/time/fps                    | 4461         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0031068558 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.63        |
|    agent/train/explained_variance    | 0.873        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.137        |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -182         |
|    agent/time/fps                    | 4100         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0028802808 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.649       |
|    agent/train/explained_variance    | 0.736        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0715       |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.1         |
|    agent/rollout/ep_rew_wrapped_mean | -176         |
|    agent/time/fps                    | 4333         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0023848352 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.654       |
|    agent/train/explained_variance    | 0.745        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0913       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.2        |
|    agent/rollout/ep_rew_wrapped_mean | -165        |
|    agent/time/fps                    | 3763        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.002788487 |
|    agent/train/clip_fraction         | 0.118       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.614      |
|    agent/train/explained_variance    | 0.855       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.114       |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.4        |
|    agent/rollout/ep_rew_wrapped_mean | -152        |
|    agent/time/fps                    | 4430        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.003951132 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.607      |
|    agent/train/explained_variance    | 0.726       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.11        |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -137         |
|    agent/time/fps                    | 4268         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0036564795 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.616       |
|    agent/train/explained_variance    | 0.84         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.119        |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.6         |
|    agent/rollout/ep_rew_wrapped_mean | -119         |
|    agent/time/fps                    | 4509         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0023607002 |
|    agent/train/clip_fraction         | 0.0694       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.607       |
|    agent/train/explained_variance    | 0.447        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.363        |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.6         |
|    agent/rollout/ep_rew_wrapped_mean | -105         |
|    agent/time/fps                    | 4325         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0060998583 |
|    agent/train/clip_fraction         | 0.246        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.616       |
|    agent/train/explained_variance    | 0.561        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.429        |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -98.4        |
|    agent/time/fps                    | 4493         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0039604376 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.601       |
|    agent/train/explained_variance    | 0.566        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.198        |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | -91.1        |
|    agent/time/fps                    | 4424         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0029738445 |
|    agent/train/clip_fraction         | 0.0956       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.528       |
|    agent/train/explained_variance    | 0.645        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.607        |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.3        |
|    agent/rollout/ep_rew_wrapped_mean | -86.2       |
|    agent/time/fps                    | 4457        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.002273971 |
|    agent/train/clip_fraction         | 0.0757      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.521      |
|    agent/train/explained_variance    | 0.776       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.264       |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.2        |
|    agent/rollout/ep_rew_wrapped_mean | -85.8       |
|    agent/time/fps                    | 4505        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 108544      |
|    agent/train/approx_kl             | 0.004124728 |
|    agent/train/clip_fraction         | 0.119       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.501      |
|    agent/train/explained_variance    | 0.693       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.465       |
|    agent/train/n_updates             | 520         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.5         |
|    agent/rollout/ep_rew_wrapped_mean | -84.5        |
|    agent/time/fps                    | 4136         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0032040598 |
|    agent/train/clip_fraction         | 0.0917       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.384       |
|    agent/train/explained_variance    | 0.891        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.114        |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.6         |
|    agent/rollout/ep_rew_wrapped_mean | -86.5        |
|    agent/time/fps                    | 4517         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0031689345 |
|    agent/train/clip_fraction         | 0.0848       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.427       |
|    agent/train/explained_variance    | 0.774        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.177        |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.1        |
|    agent/rollout/ep_rew_wrapped_mean | -86.4       |
|    agent/time/fps                    | 4437        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.008053248 |
|    agent/train/clip_fraction         | 0.0766      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.273      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0934      |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28          |
|    agent/rollout/ep_rew_wrapped_mean | -83.2       |
|    agent/time/fps                    | 4169        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.004246504 |
|    agent/train/clip_fraction         | 0.0803      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.347      |
|    agent/train/explained_variance    | 0.931       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.133       |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.4         |
|    agent/rollout/ep_rew_wrapped_mean | -81.6        |
|    agent/time/fps                    | 4513         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0063863983 |
|    agent/train/clip_fraction         | 0.0616       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.181       |
|    agent/train/explained_variance    | 0.899        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.12         |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27           |
|    agent/rollout/ep_rew_wrapped_mean | -81.5        |
|    agent/time/fps                    | 4356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0011661502 |
|    agent/train/clip_fraction         | 0.0688       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.284       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.105        |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.9        |
|    agent/rollout/ep_rew_wrapped_mean | -84.8       |
|    agent/time/fps                    | 4061        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.004670658 |
|    agent/train/clip_fraction         | 0.101       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.312      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0644      |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.1        |
|    agent/rollout/ep_rew_wrapped_mean | -88.2       |
|    agent/time/fps                    | 4246        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.005314799 |
|    agent/train/clip_fraction         | 0.0645      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.268      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.123       |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.160 MB of 0.160 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▄▅▇█▇▆▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▁▂▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
train/approx_kl,▂█▃▂▁▇▁▅▂▂▃▂▂▁▂▂▂▃▂▂▃▂▄▄▃▄▂▂▄▃▆▄▂▆▄▂▃▄▄▅
train/clip_fraction,█▆▃▁▁▁▁▂▃▁▂▃▂▂▃▃▆▇▆▅▅▅█▅▅▄▃▃▃▂▄▂▂▃▃▃▃▃▄▄
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▄▄▆▇██▇▆▇▇▇▇▇▆▆▄▃▃▃▃▃▄▅▅▆▇▇▇█▇██▇▇▇▇▇▇▇
train/explained_variance,▁▇███████████████████▅▇█████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,14.31
time/fps,2756.0
train/approx_kl,0.01039
train/clip_fraction,0.11855
train/clip_range,0.1
train/entropy_loss,-0.17516
train/explained_variance,0.99631
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [04:46<19:06, 286.69s/it]

Query schedule: [10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Collecting 20 fragments (4000 transitions)
Requested 2400 transitions but only 0 in buffer. Sampling 2400 additional transitions.
Sampling 1600 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 10 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.5     |
|    agent/rollout/ep_rew_wrapped_mean | 358      |
|    agent/time/fps                    | 4426     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.5     |
|    agent/rollout/ep_rew_wrapped_mean | 358      |
|    agent/time/fps                    | 4.43e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.8         |
|    agent/rollout/ep_rew_wrapped_mean | 262          |
|    agent/time/fps                    | 4306         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0024335561 |
|    agent/train/clip_fraction         | 0.0708       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | -0.125       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0578       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35           |
|    agent/rollout/ep_rew_wrapped_mean | 211          |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0024478482 |
|    agent/train/clip_fraction         | 0.0553       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.899        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00126     |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.7         |
|    agent/rollout/ep_rew_wrapped_mean | 179          |
|    agent/time/fps                    | 4476         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0023867008 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.683       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00804      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.4         |
|    agent/rollout/ep_rew_wrapped_mean | 159          |
|    agent/time/fps                    | 3834         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0014392203 |
|    agent/train/clip_fraction         | 0.0368       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.94         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0132       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.1         |
|    agent/rollout/ep_rew_wrapped_mean | 147          |
|    agent/time/fps                    | 4455         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0018506132 |
|    agent/train/clip_fraction         | 0.0639       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.927        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0332       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.4         |
|    agent/rollout/ep_rew_wrapped_mean | 137          |
|    agent/time/fps                    | 4456         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0032818257 |
|    agent/train/clip_fraction         | 0.176        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00555      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.8         |
|    agent/rollout/ep_rew_wrapped_mean | 131          |
|    agent/time/fps                    | 4477         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0020692062 |
|    agent/train/clip_fraction         | 0.0839       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00836      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.8         |
|    agent/rollout/ep_rew_wrapped_mean | 127          |
|    agent/time/fps                    | 4490         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0022597662 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.667       |
|    agent/train/explained_variance    | 0.945        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0298       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.4         |
|    agent/rollout/ep_rew_wrapped_mean | 126          |
|    agent/time/fps                    | 4319         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0029052326 |
|    agent/train/clip_fraction         | 0.22         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.675       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0276       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.7         |
|    agent/rollout/ep_rew_wrapped_mean | 132          |
|    agent/time/fps                    | 4090         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0027105273 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.936        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.033        |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.7         |
|    agent/rollout/ep_rew_wrapped_mean | 142          |
|    agent/time/fps                    | 4329         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0035929303 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0222       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.3         |
|    agent/rollout/ep_rew_wrapped_mean | 154          |
|    agent/time/fps                    | 4320         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0018331717 |
|    agent/train/clip_fraction         | 0.0818       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.889        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.105        |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.6         |
|    agent/rollout/ep_rew_wrapped_mean | 155          |
|    agent/time/fps                    | 4213         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0020984868 |
|    agent/train/clip_fraction         | 0.0912       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0297       |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.1         |
|    agent/rollout/ep_rew_wrapped_mean | 148          |
|    agent/time/fps                    | 4484         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0024229013 |
|    agent/train/clip_fraction         | 0.0778       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.654       |
|    agent/train/explained_variance    | 0.547        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.23         |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.2         |
|    agent/rollout/ep_rew_wrapped_mean | 141          |
|    agent/time/fps                    | 4096         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0016623749 |
|    agent/train/clip_fraction         | 0.0698       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.552        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0564       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.3         |
|    agent/rollout/ep_rew_wrapped_mean | 133          |
|    agent/time/fps                    | 4239         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0020488102 |
|    agent/train/clip_fraction         | 0.0646       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.647       |
|    agent/train/explained_variance    | 0.64         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.115        |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.7         |
|    agent/rollout/ep_rew_wrapped_mean | 129          |
|    agent/time/fps                    | 4297         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0032078996 |
|    agent/train/clip_fraction         | 0.174        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.644       |
|    agent/train/explained_variance    | 0.845        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00649      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.9         |
|    agent/rollout/ep_rew_wrapped_mean | 125          |
|    agent/time/fps                    | 4490         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0029509887 |
|    agent/train/clip_fraction         | 0.187        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.633       |
|    agent/train/explained_variance    | 0.92         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00274     |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.3         |
|    agent/rollout/ep_rew_wrapped_mean | 113          |
|    agent/time/fps                    | 4462         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0031159515 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.632       |
|    agent/train/explained_variance    | 0.812        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0473       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23          |
|    agent/rollout/ep_rew_wrapped_mean | 98.2        |
|    agent/time/fps                    | 4470        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.003012566 |
|    agent/train/clip_fraction         | 0.168       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.598      |
|    agent/train/explained_variance    | 0.776       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00183     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.9        |
|    agent/rollout/ep_rew_wrapped_mean | 97.6        |
|    agent/time/fps                    | 4500        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.003963575 |
|    agent/train/clip_fraction         | 0.217       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.577      |
|    agent/train/explained_variance    | 0.626       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00398    |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23           |
|    agent/rollout/ep_rew_wrapped_mean | 98.3         |
|    agent/time/fps                    | 4503         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0036563794 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.556       |
|    agent/train/explained_variance    | 0.768        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000717     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23           |
|    agent/rollout/ep_rew_wrapped_mean | 99.4         |
|    agent/time/fps                    | 4318         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0034275725 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.545       |
|    agent/train/explained_variance    | 0.897        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0026       |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23           |
|    agent/rollout/ep_rew_wrapped_mean | 101          |
|    agent/time/fps                    | 4322         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0043272926 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.539       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0232      |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.4         |
|    agent/rollout/ep_rew_wrapped_mean | 101          |
|    agent/time/fps                    | 4019         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0040860022 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.518       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0044      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.2        |
|    agent/rollout/ep_rew_wrapped_mean | 102         |
|    agent/time/fps                    | 2273        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.004216755 |
|    agent/train/clip_fraction         | 0.177       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.503      |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00124     |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.4         |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4294         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0052171615 |
|    agent/train/clip_fraction         | 0.208        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.492       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00562     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4161         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0044775805 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.478       |
|    agent/train/explained_variance    | 0.979        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00566     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.7        |
|    agent/rollout/ep_rew_wrapped_mean | 103         |
|    agent/time/fps                    | 3659        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.007045488 |
|    agent/train/clip_fraction         | 0.191       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.474      |
|    agent/train/explained_variance    | 0.972       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0152     |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 99           |
|    agent/time/fps                    | 4027         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0072145527 |
|    agent/train/clip_fraction         | 0.169        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.444       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00413     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.9         |
|    agent/rollout/ep_rew_wrapped_mean | 91.8         |
|    agent/time/fps                    | 4430         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0064164735 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.402       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00146     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.7         |
|    agent/rollout/ep_rew_wrapped_mean | 82.7         |
|    agent/time/fps                    | 4260         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0034706378 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.359       |
|    agent/train/explained_variance    | 0.936        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.01        |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.4         |
|    agent/rollout/ep_rew_wrapped_mean | 81.5         |
|    agent/time/fps                    | 4261         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0049747946 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.376       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0205       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.1         |
|    agent/rollout/ep_rew_wrapped_mean | 86.2         |
|    agent/time/fps                    | 4260         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0049546584 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.337       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00516     |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.1         |
|    agent/rollout/ep_rew_wrapped_mean | 92.5         |
|    agent/time/fps                    | 3639         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0026692948 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.3         |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0198      |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.3         |
|    agent/rollout/ep_rew_wrapped_mean | 99.9         |
|    agent/time/fps                    | 4155         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0059550563 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.272       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00875      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | 106          |
|    agent/time/fps                    | 4221         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0028477367 |
|    agent/train/clip_fraction         | 0.096        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.262       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0101       |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22           |
|    agent/rollout/ep_rew_wrapped_mean | 111          |
|    agent/time/fps                    | 4375         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0047437716 |
|    agent/train/clip_fraction         | 0.119        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.251       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00551      |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.3        |
|    agent/rollout/ep_rew_wrapped_mean | 117         |
|    agent/time/fps                    | 4298        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.003073784 |
|    agent/train/clip_fraction         | 0.0932      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.237      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.000854    |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.1        |
|    agent/rollout/ep_rew_wrapped_mean | 122         |
|    agent/time/fps                    | 4170        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.004411246 |
|    agent/train/clip_fraction         | 0.0747      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.215      |
|    agent/train/explained_variance    | 0.95        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00964    |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.2         |
|    agent/rollout/ep_rew_wrapped_mean | 127          |
|    agent/time/fps                    | 4445         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0022316286 |
|    agent/train/clip_fraction         | 0.0838       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.223       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000883     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.2         |
|    agent/rollout/ep_rew_wrapped_mean | 132          |
|    agent/time/fps                    | 3535         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0027734172 |
|    agent/train/clip_fraction         | 0.0667       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.198       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00825      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.3        |
|    agent/rollout/ep_rew_wrapped_mean | 136         |
|    agent/time/fps                    | 4496        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.002323804 |
|    agent/train/clip_fraction         | 0.0628      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.216      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00307    |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | 141          |
|    agent/time/fps                    | 4375         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0013246543 |
|    agent/train/clip_fraction         | 0.065        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.189       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00681     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.1        |
|    agent/rollout/ep_rew_wrapped_mean | 146         |
|    agent/time/fps                    | 4026        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.002200279 |
|    agent/train/clip_fraction         | 0.0563      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.171      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00716    |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.1         |
|    agent/rollout/ep_rew_wrapped_mean | 151          |
|    agent/time/fps                    | 4325         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0031705515 |
|    agent/train/clip_fraction         | 0.0758       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.18        |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000453     |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.1        |
|    agent/rollout/ep_rew_wrapped_mean | 154         |
|    agent/time/fps                    | 4376        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.002047067 |
|    agent/train/clip_fraction         | 0.0723      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.183      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00688     |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.7         |
|    agent/rollout/ep_rew_wrapped_mean | 157          |
|    agent/time/fps                    | 4368         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0051029962 |
|    agent/train/clip_fraction         | 0.133        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.225       |
|    agent/train/explained_variance    | 0.87         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0109      |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.5        |
|    agent/rollout/ep_rew_wrapped_mean | 160         |
|    agent/time/fps                    | 4469        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.002252767 |
|    agent/train/clip_fraction         | 0.0832      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.211      |
|    agent/train/explained_variance    | 0.971       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00507    |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.5         |
|    agent/rollout/ep_rew_wrapped_mean | 163          |
|    agent/time/fps                    | 4112         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0022660117 |
|    agent/train/clip_fraction         | 0.0686       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.189       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0173       |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.4         |
|    agent/rollout/ep_rew_wrapped_mean | 165          |
|    agent/time/fps                    | 3897         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0057421513 |
|    agent/train/clip_fraction         | 0.0805       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.204       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00776     |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.5         |
|    agent/rollout/ep_rew_wrapped_mean | 168          |
|    agent/time/fps                    | 4315         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0023309118 |
|    agent/train/clip_fraction         | 0.0755       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.212       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00122     |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.9        |
|    agent/rollout/ep_rew_wrapped_mean | 170         |
|    agent/time/fps                    | 4132        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.004178551 |
|    agent/train/clip_fraction         | 0.0892      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.208      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00882    |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36.1        |
|    agent/rollout/ep_rew_wrapped_mean | 171         |
|    agent/time/fps                    | 4510        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.004159036 |
|    agent/train/clip_fraction         | 0.0923      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.221      |
|    agent/train/explained_variance    | 0.981       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0102     |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.5         |
|    agent/rollout/ep_rew_wrapped_mean | 172          |
|    agent/time/fps                    | 3190         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0063300924 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.223       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0032      |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.2        |
|    agent/rollout/ep_rew_wrapped_mean | 172         |
|    agent/time/fps                    | 4253        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.004611421 |
|    agent/train/clip_fraction         | 0.112       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.241      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00491     |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.9        |
|    agent/rollout/ep_rew_wrapped_mean | 171         |
|    agent/time/fps                    | 4435        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 118784      |
|    agent/train/approx_kl             | 0.005531354 |
|    agent/train/clip_fraction         | 0.105       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.259      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00253    |
|    agent/train/n_updates             | 570         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.4        |
|    agent/rollout/ep_rew_wrapped_mean | 170         |
|    agent/time/fps                    | 4277        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.004067379 |
|    agent/train/clip_fraction         | 0.104       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.24       |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00212     |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.3         |
|    agent/rollout/ep_rew_wrapped_mean | 168          |
|    agent/time/fps                    | 4060         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0138618965 |
|    agent/train/clip_fraction         | 0.167        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.264       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00311     |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 39.9        |
|    agent/rollout/ep_rew_wrapped_mean | 167         |
|    agent/time/fps                    | 3871        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.004864238 |
|    agent/train/clip_fraction         | 0.103       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.276      |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00375    |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▁▂▂▄▆▇███████████████████████████
time/fps,█▁▂▂▂▂▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▂▂▂▁▁▂▁▁▂▃▆▂▃▂▃█▂▂▁▃▂▂▁▂▂▂▁▂▂▁▁▂▁▁▁▁▄▄▂▂
train/clip_fraction,▃▄▃▁▁▃▂▂▄▅█▄▅▃▃▅▃▄▃▃▂▃▃▃▂▂▂▃▃▂▁▂▁▂▂▂▅▄▂▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▃▅█▇▅▅█▃▃▅▆▆▆▇▆▇▆▇▇▇▆▇▇▇▇▇███▇▇█▇▇▇▆▆▇▇
train/explained_variance,▁▄▇▇▂▇▇█▇███▇█▆▇▇▅▇▇▇▇▇█▆▇█▇▇█▇█▇████▆▇█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2797.0
train/approx_kl,0.00377
train/clip_fraction,0.11675
train/clip_range,0.1
train/entropy_loss,-0.29476
train/explained_variance,0.99828
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [09:31<14:16, 285.66s/it]

Query schedule: [10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Collecting 20 fragments (4000 transitions)
Requested 2400 transitions but only 0 in buffer. Sampling 2400 additional transitions.
Sampling 1600 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 10 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.8     |
|    agent/rollout/ep_rew_wrapped_mean | -34.1    |
|    agent/time/fps                    | 3819     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.8     |
|    agent/rollout/ep_rew_wrapped_mean | -34.1    |
|    agent/time/fps                    | 3.82e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.4         |
|    agent/rollout/ep_rew_wrapped_mean | -49.4        |
|    agent/time/fps                    | 3743         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0032932002 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.219        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00916     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
----------------------------------------------------
| raw/                                 |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 26.7      |
|    agent/rollout/ep_rew_wrapped_mean | -56.4     |
|    agent/time/fps                    | 4046      |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_timesteps        | 6144      |
|    agent/train/approx_kl             | 0.0033607 |
|    agent/train/clip_fraction         | 0.133     |
|    agent/train/clip_range            | 0.1       |
|    agent/train/entropy_loss          | -0.684    |
|    agent/train/explained_variance    | -0.451    |
|    agent/train/learning_rate         | 0.002     |
|    agent/train/loss                  | 0.0273    |
|    agent/train/n_updates             | 20        |
|    agent/train/policy_gradient_loss  | -0.00435  |
|    agent/tra

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.8         |
|    agent/rollout/ep_rew_wrapped_mean | -59.2        |
|    agent/time/fps                    | 4092         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0029483037 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.678       |
|    agent/train/explained_variance    | 0.423        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00881      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.9         |
|    agent/rollout/ep_rew_wrapped_mean | -60.3        |
|    agent/time/fps                    | 4070         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0041466923 |
|    agent/train/clip_fraction         | 0.213        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.843        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00138     |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.4         |
|    agent/rollout/ep_rew_wrapped_mean | -61.2        |
|    agent/time/fps                    | 4357         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0027082458 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.736        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00382      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.3         |
|    agent/rollout/ep_rew_wrapped_mean | -62.3        |
|    agent/time/fps                    | 4059         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0026953178 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00252      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | -62.7        |
|    agent/time/fps                    | 3911         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0039809057 |
|    agent/train/clip_fraction         | 0.237        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.877        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0199      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.2         |
|    agent/rollout/ep_rew_wrapped_mean | -63.4        |
|    agent/time/fps                    | 4354         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0033630491 |
|    agent/train/clip_fraction         | 0.213        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.818        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0147      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.4        |
|    agent/rollout/ep_rew_wrapped_mean | -63.4       |
|    agent/time/fps                    | 4143        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.005142475 |
|    agent/train/clip_fraction         | 0.45        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.665      |
|    agent/train/explained_variance    | 0.819       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00017    |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.7         |
|    agent/rollout/ep_rew_wrapped_mean | -63.6        |
|    agent/time/fps                    | 3709         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0052266833 |
|    agent/train/clip_fraction         | 0.462        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.662       |
|    agent/train/explained_variance    | 0.797        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0286      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18           |
|    agent/rollout/ep_rew_wrapped_mean | -63.3        |
|    agent/time/fps                    | 4129         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0039762724 |
|    agent/train/clip_fraction         | 0.296        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.663       |
|    agent/train/explained_variance    | 0.754        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0185      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.3         |
|    agent/rollout/ep_rew_wrapped_mean | -63          |
|    agent/time/fps                    | 3891         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0019062645 |
|    agent/train/clip_fraction         | 0.0834       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.667       |
|    agent/train/explained_variance    | 0.832        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00374     |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.4         |
|    agent/rollout/ep_rew_wrapped_mean | -62.3        |
|    agent/time/fps                    | 3919         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0031880857 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.837        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.012        |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.3         |
|    agent/rollout/ep_rew_wrapped_mean | -61.3        |
|    agent/time/fps                    | 4401         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0034206323 |
|    agent/train/clip_fraction         | 0.26         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.833        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0248      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.2         |
|    agent/rollout/ep_rew_wrapped_mean | -59.9        |
|    agent/time/fps                    | 4477         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0028555375 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.888        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0185      |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.2         |
|    agent/rollout/ep_rew_wrapped_mean | -59          |
|    agent/time/fps                    | 4438         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0035798685 |
|    agent/train/clip_fraction         | 0.217        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.663       |
|    agent/train/explained_variance    | 0.924        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0188      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.9        |
|    agent/rollout/ep_rew_wrapped_mean | -57.8       |
|    agent/time/fps                    | 3624        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.003965473 |
|    agent/train/clip_fraction         | 0.198       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.651      |
|    agent/train/explained_variance    | 0.924       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0246     |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.9         |
|    agent/rollout/ep_rew_wrapped_mean | -56.8        |
|    agent/time/fps                    | 3691         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0043469574 |
|    agent/train/clip_fraction         | 0.207        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0192      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.9         |
|    agent/rollout/ep_rew_wrapped_mean | -59.7        |
|    agent/time/fps                    | 4315         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0053193057 |
|    agent/train/clip_fraction         | 0.207        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.629       |
|    agent/train/explained_variance    | 0.734        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0202      |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.5        |
|    agent/rollout/ep_rew_wrapped_mean | -57.7       |
|    agent/time/fps                    | 4430        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.006237668 |
|    agent/train/clip_fraction         | 0.271       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.595      |
|    agent/train/explained_variance    | 0.831       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00225     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.2        |
|    agent/rollout/ep_rew_wrapped_mean | -56.4       |
|    agent/time/fps                    | 4368        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.006196351 |
|    agent/train/clip_fraction         | 0.238       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.571      |
|    agent/train/explained_variance    | 0.695       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00558    |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.8         |
|    agent/rollout/ep_rew_wrapped_mean | -55.4        |
|    agent/time/fps                    | 4449         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0048062494 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.555       |
|    agent/train/explained_variance    | 0.717        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00252     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.5         |
|    agent/rollout/ep_rew_wrapped_mean | -54.3        |
|    agent/time/fps                    | 3791         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0047297273 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.536       |
|    agent/train/explained_variance    | 0.925        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0121      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.1         |
|    agent/rollout/ep_rew_wrapped_mean | -53.4        |
|    agent/time/fps                    | 4052         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0063911444 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.522       |
|    agent/train/explained_variance    | 0.933        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0179      |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.2         |
|    agent/rollout/ep_rew_wrapped_mean | -52.9        |
|    agent/time/fps                    | 2599         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0038513704 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.521       |
|    agent/train/explained_variance    | 0.919        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.014       |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.6        |
|    agent/rollout/ep_rew_wrapped_mean | -52.1       |
|    agent/time/fps                    | 3762        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.004083424 |
|    agent/train/clip_fraction         | 0.218       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.516      |
|    agent/train/explained_variance    | 0.937       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0169     |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.1        |
|    agent/rollout/ep_rew_wrapped_mean | -51.3       |
|    agent/time/fps                    | 4010        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.006013398 |
|    agent/train/clip_fraction         | 0.23        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.514      |
|    agent/train/explained_variance    | 0.915       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0152     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.4         |
|    agent/rollout/ep_rew_wrapped_mean | -50.5        |
|    agent/time/fps                    | 3990         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0039072237 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.519       |
|    agent/train/explained_variance    | 0.927        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00929      |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.2        |
|    agent/rollout/ep_rew_wrapped_mean | -50         |
|    agent/time/fps                    | 3480        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.007581149 |
|    agent/train/clip_fraction         | 0.291       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.533      |
|    agent/train/explained_variance    | 0.935       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00261    |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.1        |
|    agent/rollout/ep_rew_wrapped_mean | -49.5       |
|    agent/time/fps                    | 3987        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.004744156 |
|    agent/train/clip_fraction         | 0.236       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.554      |
|    agent/train/explained_variance    | 0.929       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0163      |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.9         |
|    agent/rollout/ep_rew_wrapped_mean | -49.3        |
|    agent/time/fps                    | 4172         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0040129167 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.549       |
|    agent/train/explained_variance    | 0.861        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00393     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.7         |
|    agent/rollout/ep_rew_wrapped_mean | -49.2        |
|    agent/time/fps                    | 4179         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0058786236 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.554       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0179      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.8         |
|    agent/rollout/ep_rew_wrapped_mean | -49.5        |
|    agent/time/fps                    | 3844         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0043511195 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.536       |
|    agent/train/explained_variance    | 0.862        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.012       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 12.8        |
|    agent/rollout/ep_rew_wrapped_mean | -50.1       |
|    agent/time/fps                    | 2679        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.006651368 |
|    agent/train/clip_fraction         | 0.273       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.552      |
|    agent/train/explained_variance    | 0.964       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0224     |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.6         |
|    agent/rollout/ep_rew_wrapped_mean | -50.9        |
|    agent/time/fps                    | 3444         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0032999557 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.509       |
|    agent/train/explained_variance    | 0.856        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0192      |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.5         |
|    agent/rollout/ep_rew_wrapped_mean | -51.1        |
|    agent/time/fps                    | 4132         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0067639416 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.503       |
|    agent/train/explained_variance    | 0.907        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00594      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.4        |
|    agent/rollout/ep_rew_wrapped_mean | -51.6       |
|    agent/time/fps                    | 3869        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.004570512 |
|    agent/train/clip_fraction         | 0.19        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.521      |
|    agent/train/explained_variance    | 0.968       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0083     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | -52          |
|    agent/time/fps                    | 3031         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0065683154 |
|    agent/train/clip_fraction         | 0.222        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.521       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0082       |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.7        |
|    agent/rollout/ep_rew_wrapped_mean | -52         |
|    agent/time/fps                    | 4254        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.004348727 |
|    agent/train/clip_fraction         | 0.205       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.51       |
|    agent/train/explained_variance    | 0.972       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0129     |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -51.6        |
|    agent/time/fps                    | 3220         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0069202147 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.47        |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.022       |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15           |
|    agent/rollout/ep_rew_wrapped_mean | -50.8        |
|    agent/time/fps                    | 3912         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0053977016 |
|    agent/train/clip_fraction         | 0.183        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.488       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0228      |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.1         |
|    agent/rollout/ep_rew_wrapped_mean | -49.7        |
|    agent/time/fps                    | 4343         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0045186146 |
|    agent/train/clip_fraction         | 0.199        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.474       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0138      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.2         |
|    agent/rollout/ep_rew_wrapped_mean | -48.8        |
|    agent/time/fps                    | 3967         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0037981456 |
|    agent/train/clip_fraction         | 0.183        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.476       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.013        |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.4         |
|    agent/rollout/ep_rew_wrapped_mean | -48.1        |
|    agent/time/fps                    | 3661         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0035728505 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.477       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00108     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.9         |
|    agent/rollout/ep_rew_wrapped_mean | -47.1        |
|    agent/time/fps                    | 3983         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0046197716 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.49        |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00284     |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.9         |
|    agent/rollout/ep_rew_wrapped_mean | -45.8        |
|    agent/time/fps                    | 3821         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0034839665 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.465       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00994      |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.2         |
|    agent/rollout/ep_rew_wrapped_mean | -44.5        |
|    agent/time/fps                    | 3924         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0067098327 |
|    agent/train/clip_fraction         | 0.228        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.47        |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00973     |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.4         |
|    agent/rollout/ep_rew_wrapped_mean | -43.2        |
|    agent/time/fps                    | 3486         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0057433206 |
|    agent/train/clip_fraction         | 0.211        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.47        |
|    agent/train/explained_variance    | 0.993        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00573     |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.7         |
|    agent/rollout/ep_rew_wrapped_mean | -41.9        |
|    agent/time/fps                    | 3408         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0055612335 |
|    agent/train/clip_fraction         | 0.209        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.465       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00234      |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.8         |
|    agent/rollout/ep_rew_wrapped_mean | -40.6        |
|    agent/time/fps                    | 3891         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0049428632 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.438       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00889     |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.1         |
|    agent/rollout/ep_rew_wrapped_mean | -39          |
|    agent/time/fps                    | 4405         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0050376575 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.436       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0025       |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.1        |
|    agent/rollout/ep_rew_wrapped_mean | -37.3       |
|    agent/time/fps                    | 4142        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 108544      |
|    agent/train/approx_kl             | 0.005857031 |
|    agent/train/clip_fraction         | 0.16        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.406      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00124     |
|    agent/train/n_updates             | 520         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.2        |
|    agent/rollout/ep_rew_wrapped_mean | -35.7       |
|    agent/time/fps                    | 4110        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.009719808 |
|    agent/train/clip_fraction         | 0.212       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.397      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0172      |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.2        |
|    agent/rollout/ep_rew_wrapped_mean | -34.2       |
|    agent/time/fps                    | 4170        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.005481859 |
|    agent/train/clip_fraction         | 0.138       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.369      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00596     |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.3         |
|    agent/rollout/ep_rew_wrapped_mean | -33          |
|    agent/time/fps                    | 4071         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0076811756 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.363       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0298      |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.5        |
|    agent/rollout/ep_rew_wrapped_mean | -32.2       |
|    agent/time/fps                    | 4045        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.005894609 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.369      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00423    |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.8        |
|    agent/rollout/ep_rew_wrapped_mean | -31.8       |
|    agent/time/fps                    | 3869        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 118784      |
|    agent/train/approx_kl             | 0.015070837 |
|    agent/train/clip_fraction         | 0.163       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.353      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0151     |
|    agent/train/n_updates             | 570         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.7        |
|    agent/rollout/ep_rew_wrapped_mean | -31.6       |
|    agent/time/fps                    | 4230        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.014555635 |
|    agent/train/clip_fraction         | 0.242       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.373      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0167     |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.9        |
|    agent/rollout/ep_rew_wrapped_mean | -31.5       |
|    agent/time/fps                    | 3688        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.012369591 |
|    agent/train/clip_fraction         | 0.209       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.404      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0235      |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.4         |
|    agent/rollout/ep_rew_wrapped_mean | -31.4        |
|    agent/time/fps                    | 4256         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0064764814 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.378       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00794     |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.155 MB of 0.155 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▇█
time/fps,█▅▄▃▃▂▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▁▂▃▂▁▄▃▁▂▃▄▅▂▃▃▂▂▂▂▃▃▅▄▄▆▃▂▃▄▃▆▆█▄█▄▃▄▃
train/clip_fraction,▄▃▅▁▃▂█▅▂▃▆▄▆▁▃▄▁▃▃▂▃▃▂▃▄▃▁▂▃▄▂▂▄▄▃▇▃▃▄▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▆▆▇▇▇▇▇▇▇▇▇▇██▇▇▇▇██▇▇
train/explained_variance,▁▇▇▁█▇██████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,187.17999
time/fps,2674.0
train/approx_kl,0.0051
train/clip_fraction,0.15156
train/clip_range,0.1
train/entropy_loss,-0.30987
train/explained_variance,0.99837
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [14:38<09:51, 295.51s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011136287499943541, max=1.0…

Query schedule: [10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Collecting 20 fragments (4000 transitions)
Requested 2400 transitions but only 0 in buffer. Sampling 2400 additional transitions.
Sampling 1600 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 10 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.5     |
|    agent/rollout/ep_rew_wrapped_mean | 276      |
|    agent/time/fps                    | 2502     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.5     |
|    agent/rollout/ep_rew_wrapped_mean | 276      |
|    agent/time/fps                    | 2.5e+03  |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.6         |
|    agent/rollout/ep_rew_wrapped_mean | 158          |
|    agent/time/fps                    | 1241         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0026576938 |
|    agent/train/clip_fraction         | 0.0861       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.0261       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.04         |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.2         |
|    agent/rollout/ep_rew_wrapped_mean | 112          |
|    agent/time/fps                    | 3895         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0033167112 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.682       |
|    agent/train/explained_variance    | 0.738        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0208       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.7        |
|    agent/rollout/ep_rew_wrapped_mean | 90          |
|    agent/time/fps                    | 4359        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.002408796 |
|    agent/train/clip_fraction         | 0.0997      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.679      |
|    agent/train/explained_variance    | 0.816       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0032      |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.8         |
|    agent/rollout/ep_rew_wrapped_mean | 71.5         |
|    agent/time/fps                    | 4387         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0018956545 |
|    agent/train/clip_fraction         | 0.0545       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | -0.324       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0173       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.7         |
|    agent/rollout/ep_rew_wrapped_mean | 52.4         |
|    agent/time/fps                    | 4298         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0027391538 |
|    agent/train/clip_fraction         | 0.0664       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | -0.17        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0797       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.4         |
|    agent/rollout/ep_rew_wrapped_mean | 38.3         |
|    agent/time/fps                    | 4373         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0030030068 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.674       |
|    agent/train/explained_variance    | 0.726        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0179       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | 27.9         |
|    agent/time/fps                    | 4415         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0019352635 |
|    agent/train/clip_fraction         | 0.0667       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.808        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.027        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.8        |
|    agent/rollout/ep_rew_wrapped_mean | 19.8        |
|    agent/time/fps                    | 4467        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.003627287 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.676      |
|    agent/train/explained_variance    | 0.91        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00311    |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.5         |
|    agent/rollout/ep_rew_wrapped_mean | 14.1         |
|    agent/time/fps                    | 4400         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0018610756 |
|    agent/train/clip_fraction         | 0.0862       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.526        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0589       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | 9.76         |
|    agent/time/fps                    | 4346         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0035321484 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00328      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 30.5       |
|    agent/rollout/ep_rew_wrapped_mean | 7.61       |
|    agent/time/fps                    | 4348       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 24576      |
|    agent/train/approx_kl             | 0.00461278 |
|    agent/train/clip_fraction         | 0.38       |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.667     |
|    agent/train/explained_variance    | 0.971      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0115    |
|    agent/train/n_updates             | 110        |
|    agent/train/policy_gradient_loss  | -0.0174  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | 2.26         |
|    agent/time/fps                    | 4430         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0021989031 |
|    agent/train/clip_fraction         | 0.0965       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.122        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.395        |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | -2.75        |
|    agent/time/fps                    | 4246         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0015223657 |
|    agent/train/clip_fraction         | 0.062        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | 0.769        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0409       |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -4.84        |
|    agent/time/fps                    | 4248         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0018910093 |
|    agent/train/clip_fraction         | 0.0727       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0152      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | -5.61        |
|    agent/time/fps                    | 2660         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0022391237 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.662       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00843     |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.8         |
|    agent/rollout/ep_rew_wrapped_mean | -4.78        |
|    agent/time/fps                    | 3186         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0040562684 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.642       |
|    agent/train/explained_variance    | 0.683        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.034        |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.9        |
|    agent/rollout/ep_rew_wrapped_mean | -4.67       |
|    agent/time/fps                    | 4418        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.004009418 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.612      |
|    agent/train/explained_variance    | 0.834       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00898     |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | -4.42        |
|    agent/time/fps                    | 4189         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0039157234 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.634       |
|    agent/train/explained_variance    | 0.725        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0428       |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.7         |
|    agent/rollout/ep_rew_wrapped_mean | -18.5        |
|    agent/time/fps                    | 4244         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0016664024 |
|    agent/train/clip_fraction         | 0.0787       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.616       |
|    agent/train/explained_variance    | 0.815        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00591     |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | -28.5        |
|    agent/time/fps                    | 3996         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0027457331 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.586       |
|    agent/train/explained_variance    | 0.866        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00488      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -25.5        |
|    agent/time/fps                    | 2634         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0030518626 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.596       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00067     |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.3        |
|    agent/rollout/ep_rew_wrapped_mean | -23.7       |
|    agent/time/fps                    | 2341        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.005535418 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.573      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00967    |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.2        |
|    agent/rollout/ep_rew_wrapped_mean | -23.2       |
|    agent/time/fps                    | 921         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.005924904 |
|    agent/train/clip_fraction         | 0.177       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.536      |
|    agent/train/explained_variance    | 0.956       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0129     |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29          |
|    agent/rollout/ep_rew_wrapped_mean | -20.8       |
|    agent/time/fps                    | 3761        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.004554637 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.544      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00365     |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.4        |
|    agent/rollout/ep_rew_wrapped_mean | -15.8       |
|    agent/time/fps                    | 4018        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.004369812 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.542      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0122     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.8        |
|    agent/rollout/ep_rew_wrapped_mean | -10.7       |
|    agent/time/fps                    | 3364        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.004295982 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.524      |
|    agent/train/explained_variance    | 0.912       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0503      |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30           |
|    agent/rollout/ep_rew_wrapped_mean | -5.48        |
|    agent/time/fps                    | 2872         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0038493099 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.519       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00929     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30           |
|    agent/rollout/ep_rew_wrapped_mean | -1.4         |
|    agent/time/fps                    | 1340         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0033186064 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00534     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | 1.54         |
|    agent/time/fps                    | 3691         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0025212634 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.497       |
|    agent/train/explained_variance    | 0.979        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0151       |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | 4.5          |
|    agent/time/fps                    | 1199         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0040490637 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.516       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0168       |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | 5.89         |
|    agent/time/fps                    | 3309         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0044033183 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.516       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00131     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | 9.07         |
|    agent/time/fps                    | 4329         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0049343957 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.467       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0241       |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | 11.2         |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0052385065 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.468       |
|    agent/train/explained_variance    | 0.937        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0395       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | 11.3         |
|    agent/time/fps                    | 4248         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0018263644 |
|    agent/train/clip_fraction         | 0.0825       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.465       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0122       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.4        |
|    agent/rollout/ep_rew_wrapped_mean | 12.9        |
|    agent/time/fps                    | 4320        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.009688974 |
|    agent/train/clip_fraction         | 0.228       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.428      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00294    |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | 13.3         |
|    agent/time/fps                    | 3737         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0106572285 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.428       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0166       |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 30.9       |
|    agent/rollout/ep_rew_wrapped_mean | 14.3       |
|    agent/time/fps                    | 2807       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 77824      |
|    agent/train/approx_kl             | 0.00781309 |
|    agent/train/clip_fraction         | 0.121      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.325     |
|    agent/train/explained_variance    | 0.989      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0197     |
|    agent/train/n_updates             | 370        |
|    agent/train/policy_gradient_loss  | -0.00463 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.2         |
|    agent/rollout/ep_rew_wrapped_mean | 15.2         |
|    agent/time/fps                    | 4126         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0027378837 |
|    agent/train/clip_fraction         | 0.108        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.35        |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00956      |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.3         |
|    agent/rollout/ep_rew_wrapped_mean | 16.4         |
|    agent/time/fps                    | 2559         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0022756637 |
|    agent/train/clip_fraction         | 0.077        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.336       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00726      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.7         |
|    agent/rollout/ep_rew_wrapped_mean | 19.1         |
|    agent/time/fps                    | 1999         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0014206266 |
|    agent/train/clip_fraction         | 0.0462       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.277       |
|    agent/train/explained_variance    | 0.827        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0345       |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.4        |
|    agent/rollout/ep_rew_wrapped_mean | 21.5        |
|    agent/time/fps                    | 3501        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.004021532 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.503      |
|    agent/train/explained_variance    | 0.903       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0184      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.5         |
|    agent/rollout/ep_rew_wrapped_mean | 23           |
|    agent/time/fps                    | 2485         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0034764628 |
|    agent/train/clip_fraction         | 0.0695       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.261       |
|    agent/train/explained_variance    | 0.919        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0263       |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.4         |
|    agent/rollout/ep_rew_wrapped_mean | 24.1         |
|    agent/time/fps                    | 3486         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0033768318 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.35        |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00655      |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.5         |
|    agent/rollout/ep_rew_wrapped_mean | 24.7         |
|    agent/time/fps                    | 3344         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0040625855 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0336       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.7        |
|    agent/rollout/ep_rew_wrapped_mean | 23.9        |
|    agent/time/fps                    | 2970        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.001438329 |
|    agent/train/clip_fraction         | 0.0647      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.238      |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00817     |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.7        |
|    agent/rollout/ep_rew_wrapped_mean | 23          |
|    agent/time/fps                    | 3014        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.003950083 |
|    agent/train/clip_fraction         | 0.115       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.318      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00895     |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | 21.8         |
|    agent/time/fps                    | 3903         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0012719773 |
|    agent/train/clip_fraction         | 0.0677       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.257       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00339     |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.5         |
|    agent/rollout/ep_rew_wrapped_mean | 21.3         |
|    agent/time/fps                    | 2562         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0073018363 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.324       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00853     |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.8        |
|    agent/rollout/ep_rew_wrapped_mean | 22.1        |
|    agent/time/fps                    | 3418        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.003009013 |
|    agent/train/clip_fraction         | 0.0824      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.337      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0038      |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.4        |
|    agent/rollout/ep_rew_wrapped_mean | 22.1        |
|    agent/time/fps                    | 3099        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 104448      |
|    agent/train/approx_kl             | 0.013701334 |
|    agent/train/clip_fraction         | 0.256       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.362      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0148     |
|    agent/train/n_updates             | 500         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.6        |
|    agent/rollout/ep_rew_wrapped_mean | 22.9        |
|    agent/time/fps                    | 4410        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.008088496 |
|    agent/train/clip_fraction         | 0.17        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.407      |
|    agent/train/explained_variance    | 0.997       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00702     |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32           |
|    agent/rollout/ep_rew_wrapped_mean | 23.3         |
|    agent/time/fps                    | 3253         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0067612636 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.447       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0319       |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.7         |
|    agent/rollout/ep_rew_wrapped_mean | 25.3         |
|    agent/time/fps                    | 4203         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0072920844 |
|    agent/train/clip_fraction         | 0.256        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0224      |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32           |
|    agent/rollout/ep_rew_wrapped_mean | 28.3         |
|    agent/time/fps                    | 2351         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0040886225 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.473       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0333       |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32          |
|    agent/rollout/ep_rew_wrapped_mean | 28.2        |
|    agent/time/fps                    | 4220        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.009683057 |
|    agent/train/clip_fraction         | 0.171       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.464      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0138     |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.5         |
|    agent/rollout/ep_rew_wrapped_mean | 27.8         |
|    agent/time/fps                    | 3825         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0036842003 |
|    agent/train/clip_fraction         | 0.14         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.5         |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00587     |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 32.5       |
|    agent/rollout/ep_rew_wrapped_mean | 28         |
|    agent/time/fps                    | 2157       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 118784     |
|    agent/train/approx_kl             | 0.00494126 |
|    agent/train/clip_fraction         | 0.172      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.5       |
|    agent/train/explained_variance    | 0.974      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00405    |
|    agent/train/n_updates             | 570        |
|    agent/train/policy_gradient_loss  | -0.000886

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.7        |
|    agent/rollout/ep_rew_wrapped_mean | 28.1        |
|    agent/time/fps                    | 3506        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.002527135 |
|    agent/train/clip_fraction         | 0.111       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.502      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00818     |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.8         |
|    agent/rollout/ep_rew_wrapped_mean | 27.3         |
|    agent/time/fps                    | 1157         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0023861493 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.495       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00203      |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.6        |
|    agent/rollout/ep_rew_wrapped_mean | 25.4        |
|    agent/time/fps                    | 2455        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.002808573 |
|    agent/train/clip_fraction         | 0.142       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.512      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00469    |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▃▁▁▁▁▁▁▂▂▄▇██▆▄▃▄▄▄▄▄▄▄▅▆▇▇▇▆▅▅▅▅▄▄▅▅▅▅▄
time/fps,█▁▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂
train/approx_kl,▁▂▂▁▂▃▁▁▁▂▂▂▁▁▂▂▂▂▂▂▂▄▃▂▁▁▂▃▂▂▂▂▃▂▃▂▅▃█▂
train/clip_fraction,▅█▆▅▃▄▂▂▂▆▅▂▃▂▂▂▃▁▁▃▃▂▃▂▃▂▃▂▂▂▂▁▃▂▃▂▂▁▂▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▅▆▆▇▇▇▅▆▇▇▇██▇██▇▇█▇▇▇█▇▇█▇██████████
train/explained_variance,▁▅███████▆██████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,38.18
time/fps,2859.0
train/approx_kl,0.00567
train/clip_fraction,0.09517
train/clip_range,0.1
train/entropy_loss,-0.13174
train/explained_variance,0.99935
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [19:57<05:04, 304.54s/it]

Query schedule: [10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Collecting 20 fragments (4000 transitions)
Requested 2400 transitions but only 0 in buffer. Sampling 2400 additional transitions.
Sampling 1600 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 10 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 38.2     |
|    agent/rollout/ep_rew_wrapped_mean | 67       |
|    agent/time/fps                    | 4510     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 38.2     |
|    agent/rollout/ep_rew_wrapped_mean | 67       |
|    agent/time/fps                    | 4.51e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.4        |
|    agent/rollout/ep_rew_wrapped_mean | 22.1        |
|    agent/time/fps                    | 4247        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.002305199 |
|    agent/train/clip_fraction         | 0.0555      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.691      |
|    agent/train/explained_variance    | -0.371      |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0191      |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.9         |
|    agent/rollout/ep_rew_wrapped_mean | 4.97         |
|    agent/time/fps                    | 4388         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0039109113 |
|    agent/train/clip_fraction         | 0.0978       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.754        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0133      |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | -11.7        |
|    agent/time/fps                    | 4121         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0024244555 |
|    agent/train/clip_fraction         | 0.0781       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.56         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0461       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.4         |
|    agent/rollout/ep_rew_wrapped_mean | -31.4        |
|    agent/time/fps                    | 2828         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0026447386 |
|    agent/train/clip_fraction         | 0.119        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.803        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.319        |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.8         |
|    agent/rollout/ep_rew_wrapped_mean | -54.8        |
|    agent/time/fps                    | 4185         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0025881613 |
|    agent/train/clip_fraction         | 0.078        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.913        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.195        |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24          |
|    agent/rollout/ep_rew_wrapped_mean | -70         |
|    agent/time/fps                    | 4173        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.001202041 |
|    agent/train/clip_fraction         | 0.0696      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.654      |
|    agent/train/explained_variance    | 0.932       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.221       |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.7         |
|    agent/rollout/ep_rew_wrapped_mean | -80.5        |
|    agent/time/fps                    | 4106         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0015440129 |
|    agent/train/clip_fraction         | 0.0694       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.662       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0749       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.9         |
|    agent/rollout/ep_rew_wrapped_mean | -89          |
|    agent/time/fps                    | 3770         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0029170373 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.67        |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00642      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.1         |
|    agent/rollout/ep_rew_wrapped_mean | -94.5        |
|    agent/time/fps                    | 4355         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0025224914 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0101      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.5         |
|    agent/rollout/ep_rew_wrapped_mean | -99.9        |
|    agent/time/fps                    | 4343         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0027387198 |
|    agent/train/clip_fraction         | 0.182        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.67        |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0231       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.6         |
|    agent/rollout/ep_rew_wrapped_mean | -102         |
|    agent/time/fps                    | 3920         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0052388916 |
|    agent/train/clip_fraction         | 0.38         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00349      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.7        |
|    agent/rollout/ep_rew_wrapped_mean | -102        |
|    agent/time/fps                    | 4360        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.004058931 |
|    agent/train/clip_fraction         | 0.186       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.678      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00466     |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.2        |
|    agent/rollout/ep_rew_wrapped_mean | -101        |
|    agent/time/fps                    | 4430        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.002531903 |
|    agent/train/clip_fraction         | 0.128       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.678      |
|    agent/train/explained_variance    | 0.854       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0249      |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | -97.3        |
|    agent/time/fps                    | 4434         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0022220942 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.666       |
|    agent/train/explained_variance    | 0.858        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0286       |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.9         |
|    agent/rollout/ep_rew_wrapped_mean | -93.8        |
|    agent/time/fps                    | 4367         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0024220343 |
|    agent/train/clip_fraction         | 0.14         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.663       |
|    agent/train/explained_variance    | 0.79         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0182       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.5         |
|    agent/rollout/ep_rew_wrapped_mean | -91.4        |
|    agent/time/fps                    | 4374         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0031675925 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.857        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0219       |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.5         |
|    agent/rollout/ep_rew_wrapped_mean | -89.1        |
|    agent/time/fps                    | 4140         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0021959348 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.631       |
|    agent/train/explained_variance    | 0.892        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00013      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.2        |
|    agent/rollout/ep_rew_wrapped_mean | -87.2       |
|    agent/time/fps                    | 4320        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003256977 |
|    agent/train/clip_fraction         | 0.198       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.619      |
|    agent/train/explained_variance    | 0.954       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00432    |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.8         |
|    agent/rollout/ep_rew_wrapped_mean | -90.3        |
|    agent/time/fps                    | 4448         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0035081161 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.908        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0166       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.3         |
|    agent/rollout/ep_rew_wrapped_mean | -97.2        |
|    agent/time/fps                    | 4331         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0057088444 |
|    agent/train/clip_fraction         | 0.266        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.561       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0121      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.9         |
|    agent/rollout/ep_rew_wrapped_mean | -97          |
|    agent/time/fps                    | 4411         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0035134805 |
|    agent/train/clip_fraction         | 0.18         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.549       |
|    agent/train/explained_variance    | 0.916        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00182     |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.6        |
|    agent/rollout/ep_rew_wrapped_mean | -97.9       |
|    agent/time/fps                    | 4370        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.003756314 |
|    agent/train/clip_fraction         | 0.165       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.531      |
|    agent/train/explained_variance    | 0.938       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00296     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.2         |
|    agent/rollout/ep_rew_wrapped_mean | -97.4        |
|    agent/time/fps                    | 4146         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0053447355 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.505       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000261    |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.8        |
|    agent/rollout/ep_rew_wrapped_mean | -94.3       |
|    agent/time/fps                    | 4222        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.012986526 |
|    agent/train/clip_fraction         | 0.177       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.436      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0247     |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.6         |
|    agent/rollout/ep_rew_wrapped_mean | -88.3        |
|    agent/time/fps                    | 4308         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0041864896 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.398       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00145     |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 15.7       |
|    agent/rollout/ep_rew_wrapped_mean | -83.4      |
|    agent/time/fps                    | 4390       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 55296      |
|    agent/train/approx_kl             | 0.00404191 |
|    agent/train/clip_fraction         | 0.142      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.388     |
|    agent/train/explained_variance    | 0.936      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0135    |
|    agent/train/n_updates             | 260        |
|    agent/train/policy_gradient_loss  | -0.00392 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.1        |
|    agent/rollout/ep_rew_wrapped_mean | -79         |
|    agent/time/fps                    | 4084        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.004744824 |
|    agent/train/clip_fraction         | 0.137       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.372      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00525    |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.8        |
|    agent/rollout/ep_rew_wrapped_mean | -74.6       |
|    agent/time/fps                    | 4426        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 59392       |
|    agent/train/approx_kl             | 0.008044767 |
|    agent/train/clip_fraction         | 0.182       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.347      |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00476     |
|    agent/train/n_updates             | 280         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 14.6       |
|    agent/rollout/ep_rew_wrapped_mean | -71        |
|    agent/time/fps                    | 4463       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 61440      |
|    agent/train/approx_kl             | 0.00500011 |
|    agent/train/clip_fraction         | 0.166      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.358     |
|    agent/train/explained_variance    | 0.967      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0248    |
|    agent/train/n_updates             | 290        |
|    agent/train/policy_gradient_loss  | -0.00783 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | -66.8        |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0026273313 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.346       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00554     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.6         |
|    agent/rollout/ep_rew_wrapped_mean | -63.8        |
|    agent/time/fps                    | 4370         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0039875316 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.333       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00602     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.4         |
|    agent/rollout/ep_rew_wrapped_mean | -62.3        |
|    agent/time/fps                    | 4272         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0037887236 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.309       |
|    agent/train/explained_variance    | 0.969        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00909     |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.2         |
|    agent/rollout/ep_rew_wrapped_mean | -61.8        |
|    agent/time/fps                    | 4410         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0033434448 |
|    agent/train/clip_fraction         | 0.125        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.298       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0107      |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.9         |
|    agent/rollout/ep_rew_wrapped_mean | -63.1        |
|    agent/time/fps                    | 4295         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0048154257 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.279       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0183      |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.7         |
|    agent/rollout/ep_rew_wrapped_mean | -64.5        |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0030802684 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.257       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00162     |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.2         |
|    agent/rollout/ep_rew_wrapped_mean | -65.5        |
|    agent/time/fps                    | 4356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0042802296 |
|    agent/train/clip_fraction         | 0.0954       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.23        |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0076      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.7         |
|    agent/rollout/ep_rew_wrapped_mean | -66.6        |
|    agent/time/fps                    | 4407         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0022856756 |
|    agent/train/clip_fraction         | 0.0682       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.213       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00311     |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.9        |
|    agent/rollout/ep_rew_wrapped_mean | -67.5       |
|    agent/time/fps                    | 4326        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.005977839 |
|    agent/train/clip_fraction         | 0.0825      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.239      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0166     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10.7         |
|    agent/rollout/ep_rew_wrapped_mean | -68.6        |
|    agent/time/fps                    | 4441         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0018058864 |
|    agent/train/clip_fraction         | 0.0766       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.236       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00436     |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10           |
|    agent/rollout/ep_rew_wrapped_mean | -69.6        |
|    agent/time/fps                    | 4409         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0021602344 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.242       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00234      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.84         |
|    agent/rollout/ep_rew_wrapped_mean | -70.4        |
|    agent/time/fps                    | 4277         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0014321147 |
|    agent/train/clip_fraction         | 0.0991       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.248       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00222     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.31        |
|    agent/rollout/ep_rew_wrapped_mean | -70.9       |
|    agent/time/fps                    | 4417        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.003010783 |
|    agent/train/clip_fraction         | 0.125       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.25       |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00662    |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.12         |
|    agent/rollout/ep_rew_wrapped_mean | -71.2        |
|    agent/time/fps                    | 4297         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0022903667 |
|    agent/train/clip_fraction         | 0.0887       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.231       |
|    agent/train/explained_variance    | 0.979        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00968     |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.94         |
|    agent/rollout/ep_rew_wrapped_mean | -71.7        |
|    agent/time/fps                    | 4482         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0039223665 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.231       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00412      |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.87         |
|    agent/rollout/ep_rew_wrapped_mean | -71.8        |
|    agent/time/fps                    | 4392         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0036269007 |
|    agent/train/clip_fraction         | 0.0783       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.25        |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.033        |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.84         |
|    agent/rollout/ep_rew_wrapped_mean | -71.7        |
|    agent/time/fps                    | 4420         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0078082466 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.271       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00576     |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.68         |
|    agent/rollout/ep_rew_wrapped_mean | -71.8        |
|    agent/time/fps                    | 4351         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0044145407 |
|    agent/train/clip_fraction         | 0.0977       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.285       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000879    |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.7         |
|    agent/rollout/ep_rew_wrapped_mean | -71.8       |
|    agent/time/fps                    | 4448        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 100352      |
|    agent/train/approx_kl             | 0.003282858 |
|    agent/train/clip_fraction         | 0.125       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.301      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0128     |
|    agent/train/n_updates             | 480         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.7          |
|    agent/rollout/ep_rew_wrapped_mean | -71.8        |
|    agent/time/fps                    | 4458         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0034436057 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.302       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.011       |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.65         |
|    agent/rollout/ep_rew_wrapped_mean | -72          |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0036194306 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.324       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00264      |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.65        |
|    agent/rollout/ep_rew_wrapped_mean | -72.3       |
|    agent/time/fps                    | 3857        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.004013801 |
|    agent/train/clip_fraction         | 0.126       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.327      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.02       |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.65         |
|    agent/rollout/ep_rew_wrapped_mean | -72.7        |
|    agent/time/fps                    | 3667         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0051187295 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.34        |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00121     |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.57        |
|    agent/rollout/ep_rew_wrapped_mean | -73.3       |
|    agent/time/fps                    | 2813        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.004947768 |
|    agent/train/clip_fraction         | 0.141       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.335      |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.011      |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.49         |
|    agent/rollout/ep_rew_wrapped_mean | -73.9        |
|    agent/time/fps                    | 3562         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0028157057 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.35        |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0095      |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.61         |
|    agent/rollout/ep_rew_wrapped_mean | -74.6        |
|    agent/time/fps                    | 4335         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0059402366 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.035       |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.67         |
|    agent/rollout/ep_rew_wrapped_mean | -75.1        |
|    agent/time/fps                    | 4128         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0028314136 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.404       |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00315      |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 8.71       |
|    agent/rollout/ep_rew_wrapped_mean | -75.6      |
|    agent/time/fps                    | 4489       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 118784     |
|    agent/train/approx_kl             | 0.01035477 |
|    agent/train/clip_fraction         | 0.225      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.42      |
|    agent/train/explained_variance    | 0.965      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0123     |
|    agent/train/n_updates             | 570        |
|    agent/train/policy_gradient_loss  | -0.00858 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.76        |
|    agent/rollout/ep_rew_wrapped_mean | -76.1       |
|    agent/time/fps                    | 4087        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.008132137 |
|    agent/train/clip_fraction         | 0.221       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.456      |
|    agent/train/explained_variance    | 0.953       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0315     |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.78         |
|    agent/rollout/ep_rew_wrapped_mean | -76.5        |
|    agent/time/fps                    | 4206         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0059365407 |
|    agent/train/clip_fraction         | 0.207        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.461       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00921     |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.8          |
|    agent/rollout/ep_rew_wrapped_mean | -76.8        |
|    agent/time/fps                    | 4385         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0055794134 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.431       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0315      |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,█▁▃▃▂▂▃▃▃▃▃▄▃▃▃▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
train/approx_kl,▃▃▆▂▃▇▃▄▃▅▄▂▇▃▂▆▂▄█▂▁▂▃▃▂▄▄▂▄▁▂█▃▃▂▂▂▃▂▂
train/clip_fraction,█▅▇▂▂▂▁▁▁▂▃▁▂▁▁▂▁▂▃▁▁▂▁▁▁▁▃▁▅▁▁▁▂▁▁▁▁▂▁▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▇▇▇████▇████▇█▇▇███████▇█▅███████████
train/explained_variance,▁▇██████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,10.33
time/fps,2879.0
train/approx_kl,0.00429
train/clip_fraction,0.06304
train/clip_range,0.1
train/entropy_loss,-0.0613
train/explained_variance,0.98952
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [24:35<00:00, 295.05s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011171594911461902, max=1.0…

Query schedule: [20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Collecting 40 fragments (8000 transitions)
Requested 4800 transitions but only 0 in buffer. Sampling 4800 additional transitions.
Sampling 3200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 42.2     |
|    agent/rollout/ep_rew_wrapped_mean | 75.1     |
|    agent/time/fps                    | 4480     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 42.2     |
|    agent/rollout/ep_rew_wrapped_mean | 75.1     |
|    agent/time/fps                    | 4.48e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.4        |
|    agent/rollout/ep_rew_wrapped_mean | 43.6        |
|    agent/time/fps                    | 3776        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.002890016 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | -0.52       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00475    |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.8         |
|    agent/rollout/ep_rew_wrapped_mean | 23.3         |
|    agent/time/fps                    | 4500         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0015386392 |
|    agent/train/clip_fraction         | 0.0376       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | -0.152       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00165      |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
----------------------------------------------------
| raw/                                 |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 26.5      |
|    agent/rollout/ep_rew_wrapped_mean | 9.49      |
|    agent/time/fps                    | 4513      |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_timesteps        | 8192      |
|    agent/train/approx_kl             | 0.0025696 |
|    agent/train/clip_fraction         | 0.104     |
|    agent/train/clip_range            | 0.1       |
|    agent/train/entropy_loss          | -0.687    |
|    agent/train/explained_variance    | 0.56      |
|    agent/train/learning_rate         | 0.002     |
|    agent/train/loss                  | 0.0329    |
|    agent/train/n_updates             | 30        |
|    agent/train/policy_gradient_loss  | -0.0033   |
|    agent/tra

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26           |
|    agent/rollout/ep_rew_wrapped_mean | -1.88        |
|    agent/time/fps                    | 4540         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0028326958 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.76         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0122      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.5         |
|    agent/rollout/ep_rew_wrapped_mean | -11.5        |
|    agent/time/fps                    | 4536         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0028124363 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.67        |
|    agent/train/explained_variance    | 0.813        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00142      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.5        |
|    agent/rollout/ep_rew_wrapped_mean | -24.2       |
|    agent/time/fps                    | 4382        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.003558467 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.662      |
|    agent/train/explained_variance    | 0.742       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0407      |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | -30          |
|    agent/time/fps                    | 4308         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0030635549 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.647       |
|    agent/train/explained_variance    | 0.491        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -7.99e-05    |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.1         |
|    agent/rollout/ep_rew_wrapped_mean | -37.8        |
|    agent/time/fps                    | 4525         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0022560502 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.636       |
|    agent/train/explained_variance    | 0.331        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00938      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.5         |
|    agent/rollout/ep_rew_wrapped_mean | -43.4        |
|    agent/time/fps                    | 4521         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0023528365 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.618       |
|    agent/train/explained_variance    | 0.896        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00739     |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.1         |
|    agent/rollout/ep_rew_wrapped_mean | -48.9        |
|    agent/time/fps                    | 4270         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0042012697 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.617       |
|    agent/train/explained_variance    | 0.857        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0101       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.8         |
|    agent/rollout/ep_rew_wrapped_mean | -53.9        |
|    agent/time/fps                    | 4542         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0019694595 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.605       |
|    agent/train/explained_variance    | 0.923        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00127      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.2         |
|    agent/rollout/ep_rew_wrapped_mean | -57.7        |
|    agent/time/fps                    | 4424         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0025461754 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.595       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0125      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.9         |
|    agent/rollout/ep_rew_wrapped_mean | -60.9        |
|    agent/time/fps                    | 4035         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0019391333 |
|    agent/train/clip_fraction         | 0.0983       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.578       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00812     |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.6         |
|    agent/rollout/ep_rew_wrapped_mean | -65.8        |
|    agent/time/fps                    | 4513         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0040879417 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0109       |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.4         |
|    agent/rollout/ep_rew_wrapped_mean | -77.4        |
|    agent/time/fps                    | 4506         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0043444815 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.778        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00692     |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.1         |
|    agent/rollout/ep_rew_wrapped_mean | -85.5        |
|    agent/time/fps                    | 4507         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0025117383 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.422        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00793     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18           |
|    agent/rollout/ep_rew_wrapped_mean | -97.9        |
|    agent/time/fps                    | 4466         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0037482693 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.567       |
|    agent/train/explained_variance    | 0.91         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00215      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.1         |
|    agent/rollout/ep_rew_wrapped_mean | -102         |
|    agent/time/fps                    | 4483         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0039452612 |
|    agent/train/clip_fraction         | 0.176        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.563       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00536      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.2         |
|    agent/rollout/ep_rew_wrapped_mean | -106         |
|    agent/time/fps                    | 4495         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0058073597 |
|    agent/train/clip_fraction         | 0.229        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.569       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0066       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.8         |
|    agent/rollout/ep_rew_wrapped_mean | -109         |
|    agent/time/fps                    | 4467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0049780197 |
|    agent/train/clip_fraction         | 0.262        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.572       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000722    |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19           |
|    agent/rollout/ep_rew_wrapped_mean | -112         |
|    agent/time/fps                    | 4428         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0027561048 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.579       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0144       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.8         |
|    agent/rollout/ep_rew_wrapped_mean | -115         |
|    agent/time/fps                    | 4469         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0024611359 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.631       |
|    agent/train/explained_variance    | 0.773        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0277       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 20.9       |
|    agent/rollout/ep_rew_wrapped_mean | -117       |
|    agent/time/fps                    | 3865       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 49152      |
|    agent/train/approx_kl             | 0.00362152 |
|    agent/train/clip_fraction         | 0.196      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.629     |
|    agent/train/explained_variance    | 0.942      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0405     |
|    agent/train/n_updates             | 230        |
|    agent/train/policy_gradient_loss  | -0.00771 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.8        |
|    agent/rollout/ep_rew_wrapped_mean | -115        |
|    agent/time/fps                    | 4205        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.003646283 |
|    agent/train/clip_fraction         | 0.194       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.621      |
|    agent/train/explained_variance    | 0.942       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0355      |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.6         |
|    agent/rollout/ep_rew_wrapped_mean | -115         |
|    agent/time/fps                    | 4488         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0029249948 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.613       |
|    agent/train/explained_variance    | 0.897        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0812       |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.1         |
|    agent/rollout/ep_rew_wrapped_mean | -113         |
|    agent/time/fps                    | 4353         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0032081641 |
|    agent/train/clip_fraction         | 0.176        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.636       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0343       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.7         |
|    agent/rollout/ep_rew_wrapped_mean | -111         |
|    agent/time/fps                    | 4472         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0038638045 |
|    agent/train/clip_fraction         | 0.183        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0306       |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.9         |
|    agent/rollout/ep_rew_wrapped_mean | -109         |
|    agent/time/fps                    | 4297         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0048590293 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.636       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00856      |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.8        |
|    agent/rollout/ep_rew_wrapped_mean | -106        |
|    agent/time/fps                    | 3560        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.004016774 |
|    agent/train/clip_fraction         | 0.224       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.629      |
|    agent/train/explained_variance    | 0.951       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0284      |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.6        |
|    agent/rollout/ep_rew_wrapped_mean | -103        |
|    agent/time/fps                    | 4456        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.005007066 |
|    agent/train/clip_fraction         | 0.322       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.645      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00867     |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27          |
|    agent/rollout/ep_rew_wrapped_mean | -100        |
|    agent/time/fps                    | 4296        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.003771418 |
|    agent/train/clip_fraction         | 0.192       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.631      |
|    agent/train/explained_variance    | 0.769       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0279      |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.6        |
|    agent/rollout/ep_rew_wrapped_mean | -97.3       |
|    agent/time/fps                    | 4276        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.004263658 |
|    agent/train/clip_fraction         | 0.323       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.64       |
|    agent/train/explained_variance    | 0.953       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.017       |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | -94.1        |
|    agent/time/fps                    | 4339         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0043608947 |
|    agent/train/clip_fraction         | 0.313        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.638       |
|    agent/train/explained_variance    | 0.921        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.02         |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.5         |
|    agent/rollout/ep_rew_wrapped_mean | -90.3        |
|    agent/time/fps                    | 4187         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0026284927 |
|    agent/train/clip_fraction         | 0.178        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.639       |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0279       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.8         |
|    agent/rollout/ep_rew_wrapped_mean | -87.3        |
|    agent/time/fps                    | 4496         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0037581027 |
|    agent/train/clip_fraction         | 0.222        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.629       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0332       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.9         |
|    agent/rollout/ep_rew_wrapped_mean | -84.4        |
|    agent/time/fps                    | 4302         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0030944683 |
|    agent/train/clip_fraction         | 0.169        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.636       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.051        |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 44.8         |
|    agent/rollout/ep_rew_wrapped_mean | -80.6        |
|    agent/time/fps                    | 4240         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0033501736 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.613       |
|    agent/train/explained_variance    | 0.979        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0402       |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 48.8        |
|    agent/rollout/ep_rew_wrapped_mean | -77.4       |
|    agent/time/fps                    | 4417        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.003541849 |
|    agent/train/clip_fraction         | 0.211       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.624      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00578     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 51.5         |
|    agent/rollout/ep_rew_wrapped_mean | -74.1        |
|    agent/time/fps                    | 4374         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0034062306 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.62        |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0229       |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 55.7        |
|    agent/rollout/ep_rew_wrapped_mean | -71.6       |
|    agent/time/fps                    | 4325        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.004494318 |
|    agent/train/clip_fraction         | 0.197       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.603      |
|    agent/train/explained_variance    | 0.957       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0524      |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 57.5        |
|    agent/rollout/ep_rew_wrapped_mean | -69.1       |
|    agent/time/fps                    | 4326        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.003242113 |
|    agent/train/clip_fraction         | 0.185       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.584      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0283      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 59           |
|    agent/rollout/ep_rew_wrapped_mean | -67.4        |
|    agent/time/fps                    | 4361         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0024295924 |
|    agent/train/clip_fraction         | 0.096        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.548       |
|    agent/train/explained_variance    | 0.935        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.409        |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 60.6         |
|    agent/rollout/ep_rew_wrapped_mean | -65.5        |
|    agent/time/fps                    | 4331         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0029288768 |
|    agent/train/clip_fraction         | 0.133        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.554       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.21         |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 63.7        |
|    agent/rollout/ep_rew_wrapped_mean | -64.4       |
|    agent/time/fps                    | 4282        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.006113228 |
|    agent/train/clip_fraction         | 0.194       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.556      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0416      |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 66.3        |
|    agent/rollout/ep_rew_wrapped_mean | -63.8       |
|    agent/time/fps                    | 4277        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.002821066 |
|    agent/train/clip_fraction         | 0.107       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.534      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.103       |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 68.5         |
|    agent/rollout/ep_rew_wrapped_mean | -62.9        |
|    agent/time/fps                    | 4479         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0021785712 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.526       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.246        |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 69.5        |
|    agent/rollout/ep_rew_wrapped_mean | -63.2       |
|    agent/time/fps                    | 4038        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.002430102 |
|    agent/train/clip_fraction         | 0.0996      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.516      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.231       |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 69.7         |
|    agent/rollout/ep_rew_wrapped_mean | -64.1        |
|    agent/time/fps                    | 4400         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0022009104 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.524       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.468        |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 70           |
|    agent/rollout/ep_rew_wrapped_mean | -65.4        |
|    agent/time/fps                    | 4474         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0032132226 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.515       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.582        |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 71.6         |
|    agent/rollout/ep_rew_wrapped_mean | -68.3        |
|    agent/time/fps                    | 4289         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0033765053 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.461       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.934        |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 73.4        |
|    agent/rollout/ep_rew_wrapped_mean | -69.6       |
|    agent/time/fps                    | 4473        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.004520423 |
|    agent/train/clip_fraction         | 0.156       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.467      |
|    agent/train/explained_variance    | 0.983       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.644       |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 75.3         |
|    agent/rollout/ep_rew_wrapped_mean | -71          |
|    agent/time/fps                    | 4329         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0066053653 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.473       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.398        |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 79           |
|    agent/rollout/ep_rew_wrapped_mean | -73          |
|    agent/time/fps                    | 4208         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0051064016 |
|    agent/train/clip_fraction         | 0.212        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.491       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.243        |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 84.8         |
|    agent/rollout/ep_rew_wrapped_mean | -72          |
|    agent/time/fps                    | 4025         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0058699246 |
|    agent/train/clip_fraction         | 0.18         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.506       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.176        |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 89.6         |
|    agent/rollout/ep_rew_wrapped_mean | -69.5        |
|    agent/time/fps                    | 4356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0027547956 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.543       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.315        |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 97          |
|    agent/rollout/ep_rew_wrapped_mean | -67.4       |
|    agent/time/fps                    | 4417        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.003862815 |
|    agent/train/clip_fraction         | 0.234       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.535      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.234       |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 104          |
|    agent/rollout/ep_rew_wrapped_mean | -65.2        |
|    agent/time/fps                    | 4313         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0066082394 |
|    agent/train/clip_fraction         | 0.212        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.53        |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.063        |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 112        |
|    agent/rollout/ep_rew_wrapped_mean | -60.6      |
|    agent/time/fps                    | 4041       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 120832     |
|    agent/train/approx_kl             | 0.00294998 |
|    agent/train/clip_fraction         | 0.164      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.542     |
|    agent/train/explained_variance    | 0.987      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.453      |
|    agent/train/n_updates             | 580        |
|    agent/train/policy_gradient_loss  | 0.000855 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 122         |
|    agent/rollout/ep_rew_wrapped_mean | -54.1       |
|    agent/time/fps                    | 4054        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.005342375 |
|    agent/train/clip_fraction         | 0.237       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.549      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0849      |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 127          |
|    agent/rollout/ep_rew_wrapped_mean | -45.3        |
|    agent/time/fps                    | 4165         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0044905352 |
|    agent/train/clip_fraction         | 0.221        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.562       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.141        |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▂▂▄▅▇█████████████████████████████
time/fps,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▂▂▂▂▃▅▄▆█▁▂▇▂▂▄▄▄▁▆▃▂▂▅▄▂▅▃▂▆▃▄▄▇▆█▅▃▂▃
train/clip_fraction,▅▂▄▄▄▄▇▅█▅▄▃▄▃▄▃▂▂▄▃▃▅▃▃▂▃▃▂▃▃▂▁▂▂▂▂▂▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▂▂▃▄▄▃▅▇▇▇▇▇███▇██▇████▇███▇█████████
train/explained_variance,▁▇▇█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2838.0
train/approx_kl,0.00385
train/clip_fraction,0.0769
train/clip_range,0.1
train/entropy_loss,-0.1557
train/explained_variance,0.99996
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [04:59<19:56, 299.03s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011124101389820376, max=1.0…

Query schedule: [20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Collecting 40 fragments (8000 transitions)
Requested 4800 transitions but only 0 in buffer. Sampling 4800 additional transitions.
Sampling 3200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 34.5     |
|    agent/rollout/ep_rew_wrapped_mean | 294      |
|    agent/time/fps                    | 4438     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 34.5     |
|    agent/rollout/ep_rew_wrapped_mean | 294      |
|    agent/time/fps                    | 4.44e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.2         |
|    agent/rollout/ep_rew_wrapped_mean | 232          |
|    agent/time/fps                    | 4446         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0017614188 |
|    agent/train/clip_fraction         | 0.0547       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.0609       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00495      |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.1        |
|    agent/rollout/ep_rew_wrapped_mean | 197         |
|    agent/time/fps                    | 4493        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.004410781 |
|    agent/train/clip_fraction         | 0.272       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.681      |
|    agent/train/explained_variance    | 0.568       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0176      |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.4        |
|    agent/rollout/ep_rew_wrapped_mean | 172         |
|    agent/time/fps                    | 4472        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.005199504 |
|    agent/train/clip_fraction         | 0.385       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.656      |
|    agent/train/explained_variance    | 0.775       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0347      |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.1        |
|    agent/rollout/ep_rew_wrapped_mean | 146         |
|    agent/time/fps                    | 4537        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.003810015 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.628      |
|    agent/train/explained_variance    | 0.76        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.047       |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 23.8       |
|    agent/rollout/ep_rew_wrapped_mean | 119        |
|    agent/time/fps                    | 4436       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 12288      |
|    agent/train/approx_kl             | 0.00722595 |
|    agent/train/clip_fraction         | 0.203      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.589     |
|    agent/train/explained_variance    | 0.865      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.305      |
|    agent/train/n_updates             | 50         |
|    agent/train/policy_gradient_loss  | -0.00438 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.7        |
|    agent/rollout/ep_rew_wrapped_mean | 98.4        |
|    agent/time/fps                    | 4524        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.008585606 |
|    agent/train/clip_fraction         | 0.174       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.552      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00965    |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.4         |
|    agent/rollout/ep_rew_wrapped_mean | 87.6         |
|    agent/time/fps                    | 4492         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0029280838 |
|    agent/train/clip_fraction         | 0.168        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.561       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000935     |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20          |
|    agent/rollout/ep_rew_wrapped_mean | 82          |
|    agent/time/fps                    | 4499        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.003371438 |
|    agent/train/clip_fraction         | 0.125       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.601      |
|    agent/train/explained_variance    | 0.783       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0392      |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.5         |
|    agent/rollout/ep_rew_wrapped_mean | 77.3         |
|    agent/time/fps                    | 4522         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0013904598 |
|    agent/train/clip_fraction         | 0.0523       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.606       |
|    agent/train/explained_variance    | 0.848        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0098       |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.5        |
|    agent/rollout/ep_rew_wrapped_mean | 73.4        |
|    agent/time/fps                    | 4478        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.002054454 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.604      |
|    agent/train/explained_variance    | 0.881       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00824    |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.9         |
|    agent/rollout/ep_rew_wrapped_mean | 70           |
|    agent/time/fps                    | 3927         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0029113789 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.59        |
|    agent/train/explained_variance    | 0.924        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0201      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.7        |
|    agent/rollout/ep_rew_wrapped_mean | 66.8        |
|    agent/time/fps                    | 4508        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.008427831 |
|    agent/train/clip_fraction         | 0.307       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.569      |
|    agent/train/explained_variance    | 0.923       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00573    |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.2         |
|    agent/rollout/ep_rew_wrapped_mean | 64           |
|    agent/time/fps                    | 4095         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0054555377 |
|    agent/train/clip_fraction         | 0.229        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.901        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00907      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.3        |
|    agent/rollout/ep_rew_wrapped_mean | 57.3        |
|    agent/time/fps                    | 4403        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.003824581 |
|    agent/train/clip_fraction         | 0.266       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.577      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0117     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20          |
|    agent/rollout/ep_rew_wrapped_mean | 40.2        |
|    agent/time/fps                    | 4333        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.005226516 |
|    agent/train/clip_fraction         | 0.398       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.629      |
|    agent/train/explained_variance    | 0.885       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00525    |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.2         |
|    agent/rollout/ep_rew_wrapped_mean | 28.5         |
|    agent/time/fps                    | 3073         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0037090445 |
|    agent/train/clip_fraction         | 0.262        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.626       |
|    agent/train/explained_variance    | 0.877        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00988      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.9        |
|    agent/rollout/ep_rew_wrapped_mean | 17.1        |
|    agent/time/fps                    | 4494        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.004337778 |
|    agent/train/clip_fraction         | 0.242       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.616      |
|    agent/train/explained_variance    | 0.483       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00271     |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.6         |
|    agent/rollout/ep_rew_wrapped_mean | 15.6         |
|    agent/time/fps                    | 4055         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0054958477 |
|    agent/train/clip_fraction         | 0.231        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.585       |
|    agent/train/explained_variance    | 0.811        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0137      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.2         |
|    agent/rollout/ep_rew_wrapped_mean | 13.8         |
|    agent/time/fps                    | 4475         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0062230853 |
|    agent/train/clip_fraction         | 0.243        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.539       |
|    agent/train/explained_variance    | 0.88         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000566    |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.8         |
|    agent/rollout/ep_rew_wrapped_mean | 12.3         |
|    agent/time/fps                    | 4473         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0073899636 |
|    agent/train/clip_fraction         | 0.22         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.524       |
|    agent/train/explained_variance    | 0.896        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0178      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.4        |
|    agent/rollout/ep_rew_wrapped_mean | 12.2        |
|    agent/time/fps                    | 3845        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.011629397 |
|    agent/train/clip_fraction         | 0.207       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.431      |
|    agent/train/explained_variance    | 0.872       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00255    |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18          |
|    agent/rollout/ep_rew_wrapped_mean | 13.9        |
|    agent/time/fps                    | 4457        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.004209072 |
|    agent/train/clip_fraction         | 0.11        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.392      |
|    agent/train/explained_variance    | 0.936       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0179      |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.7        |
|    agent/rollout/ep_rew_wrapped_mean | 19.3        |
|    agent/time/fps                    | 4508        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.018591449 |
|    agent/train/clip_fraction         | 0.157       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.289      |
|    agent/train/explained_variance    | 0.92        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00747     |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.4        |
|    agent/rollout/ep_rew_wrapped_mean | 24.2        |
|    agent/time/fps                    | 4429        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.005813496 |
|    agent/train/clip_fraction         | 0.12        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.28       |
|    agent/train/explained_variance    | 0.525       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0183      |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.4         |
|    agent/rollout/ep_rew_wrapped_mean | 26.2         |
|    agent/time/fps                    | 4348         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0021085714 |
|    agent/train/clip_fraction         | 0.0365       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.2         |
|    agent/train/explained_variance    | 0.927        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00671      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.5         |
|    agent/rollout/ep_rew_wrapped_mean | 27           |
|    agent/time/fps                    | 4179         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0074368557 |
|    agent/train/clip_fraction         | 0.0771       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.178       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0122      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.6         |
|    agent/rollout/ep_rew_wrapped_mean | 27           |
|    agent/time/fps                    | 4141         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0012303322 |
|    agent/train/clip_fraction         | 0.025        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.141       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00685     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | 27.1         |
|    agent/time/fps                    | 4346         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0024766005 |
|    agent/train/clip_fraction         | 0.0382       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.126       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00362     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | 26.8         |
|    agent/time/fps                    | 4362         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0015560181 |
|    agent/train/clip_fraction         | 0.0421       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.162       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00929      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.2         |
|    agent/rollout/ep_rew_wrapped_mean | 26.9         |
|    agent/time/fps                    | 4358         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0014631483 |
|    agent/train/clip_fraction         | 0.0421       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.155       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0727       |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.2        |
|    agent/rollout/ep_rew_wrapped_mean | 27.5        |
|    agent/time/fps                    | 4360        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.004030962 |
|    agent/train/clip_fraction         | 0.0504      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.15       |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0711      |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | 28           |
|    agent/time/fps                    | 4395         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0012625506 |
|    agent/train/clip_fraction         | 0.0403       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.165       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00219     |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14           |
|    agent/rollout/ep_rew_wrapped_mean | 28.4         |
|    agent/time/fps                    | 4392         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0054822247 |
|    agent/train/clip_fraction         | 0.0779       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.167       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0128      |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14           |
|    agent/rollout/ep_rew_wrapped_mean | 28.5         |
|    agent/time/fps                    | 4244         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0027643563 |
|    agent/train/clip_fraction         | 0.059        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.198       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00892     |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.2        |
|    agent/rollout/ep_rew_wrapped_mean | 28.6        |
|    agent/time/fps                    | 4498        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.002697981 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.287      |
|    agent/train/explained_variance    | 0.984       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0844      |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.2         |
|    agent/rollout/ep_rew_wrapped_mean | 28.7         |
|    agent/time/fps                    | 4250         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0044414573 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.343       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00381      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.2        |
|    agent/rollout/ep_rew_wrapped_mean | 29.2        |
|    agent/time/fps                    | 4105        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.004294427 |
|    agent/train/clip_fraction         | 0.133       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.396      |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00308    |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | 29.8         |
|    agent/time/fps                    | 4295         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0024124817 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.368       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0015      |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | 30.8         |
|    agent/time/fps                    | 4215         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0055400673 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.454       |
|    agent/train/explained_variance    | 0.898        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0039      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | 32.9         |
|    agent/time/fps                    | 4098         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0035616993 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.377       |
|    agent/train/explained_variance    | 0.853        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0103       |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.3         |
|    agent/rollout/ep_rew_wrapped_mean | 34.6         |
|    agent/time/fps                    | 4340         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0025876798 |
|    agent/train/clip_fraction         | 0.075        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00244      |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.9         |
|    agent/rollout/ep_rew_wrapped_mean | 36.8         |
|    agent/time/fps                    | 4205         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0029343036 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.456       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00768      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
----------------------------------------------------
| raw/                                 |           |
|    agent/rollout/ep_len_mean         | 500       |
|    agent/rollout/ep_rew_mean         | 26        |
|    agent/rollout/ep_rew_wrapped_mean | 38.8      |
|    agent/time/fps                    | 4494      |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_timesteps        | 90112     |
|    agent/train/approx_kl             | 0.0024195 |
|    agent/train/clip_fraction         | 0.128     |
|    agent/train/clip_range            | 0.1       |
|    agent/train/entropy_loss          | -0.447    |
|    agent/train/explained_variance    | 0.911     |
|    agent/train/learning_rate         | 0.002     |
|    agent/train/loss                  | 0.00332   |
|    agent/train/n_updates             | 430       |
|    agent/train/policy_gradient_loss  | -0.00215  |
|    agent/tra

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.8         |
|    agent/rollout/ep_rew_wrapped_mean | 41.5         |
|    agent/time/fps                    | 4432         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0037746788 |
|    agent/train/clip_fraction         | 0.152        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.464       |
|    agent/train/explained_variance    | 0.964        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0108       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.8         |
|    agent/rollout/ep_rew_wrapped_mean | 46           |
|    agent/time/fps                    | 4398         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0040082284 |
|    agent/train/clip_fraction         | 0.167        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00318      |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 44.4        |
|    agent/rollout/ep_rew_wrapped_mean | 49.7        |
|    agent/time/fps                    | 3834        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.003598316 |
|    agent/train/clip_fraction         | 0.152       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.592      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00899    |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 48.2        |
|    agent/rollout/ep_rew_wrapped_mean | 53.9        |
|    agent/time/fps                    | 4487        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.002853588 |
|    agent/train/clip_fraction         | 0.14        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.58       |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00605     |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 52.1         |
|    agent/rollout/ep_rew_wrapped_mean | 57.8         |
|    agent/time/fps                    | 4497         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0027313214 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.573       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00214     |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 56.7         |
|    agent/rollout/ep_rew_wrapped_mean | 62.2         |
|    agent/time/fps                    | 4037         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0015245175 |
|    agent/train/clip_fraction         | 0.0758       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.592       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00295     |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 61.2         |
|    agent/rollout/ep_rew_wrapped_mean | 66.4         |
|    agent/time/fps                    | 4403         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0021098065 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.015       |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 65.8         |
|    agent/rollout/ep_rew_wrapped_mean | 69.2         |
|    agent/time/fps                    | 4369         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0031746633 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.582       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000694    |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 70           |
|    agent/rollout/ep_rew_wrapped_mean | 71.3         |
|    agent/time/fps                    | 4503         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0029217615 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.573       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000288    |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 73.9         |
|    agent/rollout/ep_rew_wrapped_mean | 73.3         |
|    agent/time/fps                    | 4145         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0021207645 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00164      |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 77.9        |
|    agent/rollout/ep_rew_wrapped_mean | 74.8        |
|    agent/time/fps                    | 4213        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.002565009 |
|    agent/train/clip_fraction         | 0.134       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.579      |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00964    |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 81.8         |
|    agent/rollout/ep_rew_wrapped_mean | 76           |
|    agent/time/fps                    | 3228         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0022472725 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.565       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 6.45e-05     |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 85.8         |
|    agent/rollout/ep_rew_wrapped_mean | 77.3         |
|    agent/time/fps                    | 4409         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0024147322 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.576       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00852     |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 89           |
|    agent/rollout/ep_rew_wrapped_mean | 78.1         |
|    agent/time/fps                    | 4068         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0029847894 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.572       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00792     |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 92.6         |
|    agent/rollout/ep_rew_wrapped_mean | 78.5         |
|    agent/time/fps                    | 4394         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0032598926 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.577       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00333     |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 96.5         |
|    agent/rollout/ep_rew_wrapped_mean | 79.4         |
|    agent/time/fps                    | 4303         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0025249082 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.576       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000449     |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 100         |
|    agent/rollout/ep_rew_wrapped_mean | 79.9        |
|    agent/time/fps                    | 4208        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.004054511 |
|    agent/train/clip_fraction         | 0.171       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.576      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0164     |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.162 MB of 0.162 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▂▁▁▁▁▂▃▅▆▇██████████████████████████████
time/fps,█▁▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
train/approx_kl,▁▁▁▂▁▂▁▂▂▂▁▁▂▁▃▃▂▁▂▁▃▁▂▃▁▂▂▁▂▁▂▂▃▃▁▂▅▂█▁
train/clip_fraction,▁▄▅▆▅▅▄▅▆█▆█▆▅▅▆▅▅▅▄▄▃▄▅▄▃▄▄▃▃▃▄▅▅▃▄▅▃▅▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▃▅▅▅▆▅▃▄▅▆▆▇▇▇▇▇▇▇█▇██████████▇▇█▇██▇
train/explained_variance,▁▇▇▆▇█▇█████████████████████████████▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,130.53
time/fps,2767.0
train/approx_kl,0.00373
train/clip_fraction,0.11934
train/clip_range,0.1
train/entropy_loss,-0.25347
train/explained_variance,0.99999
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [10:02<15:04, 301.57s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011172877309016055, max=1.0…

Query schedule: [20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Collecting 40 fragments (8000 transitions)
Requested 4800 transitions but only 0 in buffer. Sampling 4800 additional transitions.
Sampling 3200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.8     |
|    agent/rollout/ep_rew_wrapped_mean | -29.4    |
|    agent/time/fps                    | 4020     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 35.8     |
|    agent/rollout/ep_rew_wrapped_mean | -29.4    |
|    agent/time/fps                    | 4.02e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | -30.5        |
|    agent/time/fps                    | 2467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0023638718 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.169        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00473     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.3        |
|    agent/rollout/ep_rew_wrapped_mean | -32         |
|    agent/time/fps                    | 4391        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.004105024 |
|    agent/train/clip_fraction         | 0.276       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.683      |
|    agent/train/explained_variance    | 0.342       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0314     |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.8         |
|    agent/rollout/ep_rew_wrapped_mean | -35.7        |
|    agent/time/fps                    | 3803         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0029284046 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.666       |
|    agent/train/explained_variance    | -0.683       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0123       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.1        |
|    agent/rollout/ep_rew_wrapped_mean | -36.7       |
|    agent/time/fps                    | 2525        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.003218552 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.659      |
|    agent/train/explained_variance    | 0.889       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.017      |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.1         |
|    agent/rollout/ep_rew_wrapped_mean | -38.3        |
|    agent/time/fps                    | 4023         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0037381714 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.655       |
|    agent/train/explained_variance    | 0.866        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00325     |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | -39          |
|    agent/time/fps                    | 3397         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0044301096 |
|    agent/train/clip_fraction         | 0.278        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | 0.925        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0121      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.5         |
|    agent/rollout/ep_rew_wrapped_mean | -40          |
|    agent/time/fps                    | 4370         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0038033756 |
|    agent/train/clip_fraction         | 0.222        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.655       |
|    agent/train/explained_variance    | 0.854        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0152      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | -40.8        |
|    agent/time/fps                    | 2383         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0043208012 |
|    agent/train/clip_fraction         | 0.267        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.633       |
|    agent/train/explained_variance    | 0.79         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0114      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.6        |
|    agent/rollout/ep_rew_wrapped_mean | -41.6       |
|    agent/time/fps                    | 4406        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.005315842 |
|    agent/train/clip_fraction         | 0.227       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.606      |
|    agent/train/explained_variance    | 0.548       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00638    |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 20.2       |
|    agent/rollout/ep_rew_wrapped_mean | -44.3      |
|    agent/time/fps                    | 4324       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 22528      |
|    agent/train/approx_kl             | 0.00314432 |
|    agent/train/clip_fraction         | 0.116      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.598     |
|    agent/train/explained_variance    | 0.712      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0114     |
|    agent/train/n_updates             | 100        |
|    agent/train/policy_gradient_loss  | -0.00181 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.7        |
|    agent/rollout/ep_rew_wrapped_mean | -46.1       |
|    agent/time/fps                    | 4173        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.006775398 |
|    agent/train/clip_fraction         | 0.323       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.597      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0172     |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.3        |
|    agent/rollout/ep_rew_wrapped_mean | -47.3       |
|    agent/time/fps                    | 4467        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.002834485 |
|    agent/train/clip_fraction         | 0.146       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.609      |
|    agent/train/explained_variance    | 0.859       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.018      |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.7        |
|    agent/rollout/ep_rew_wrapped_mean | -48.4       |
|    agent/time/fps                    | 4478        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.005828565 |
|    agent/train/clip_fraction         | 0.235       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.586      |
|    agent/train/explained_variance    | 0.912       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00796    |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.3        |
|    agent/rollout/ep_rew_wrapped_mean | -50.2       |
|    agent/time/fps                    | 3837        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.002764017 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.627      |
|    agent/train/explained_variance    | 0.892       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0204     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.8        |
|    agent/rollout/ep_rew_wrapped_mean | -51.4       |
|    agent/time/fps                    | 4409        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.002854576 |
|    agent/train/clip_fraction         | 0.136       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.63       |
|    agent/train/explained_variance    | 0.875       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0102      |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.4         |
|    agent/rollout/ep_rew_wrapped_mean | -50.6        |
|    agent/time/fps                    | 4483         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0021897517 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.632       |
|    agent/train/explained_variance    | 0.836        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00883     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.2         |
|    agent/rollout/ep_rew_wrapped_mean | -53.6        |
|    agent/time/fps                    | 4455         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0028696018 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.617       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0112      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.9         |
|    agent/rollout/ep_rew_wrapped_mean | -55.1        |
|    agent/time/fps                    | 3575         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0019844032 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.612       |
|    agent/train/explained_variance    | 0.903        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0067      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.6         |
|    agent/rollout/ep_rew_wrapped_mean | -56.6        |
|    agent/time/fps                    | 4438         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0024279458 |
|    agent/train/clip_fraction         | 0.0905       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.615       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00747     |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.3         |
|    agent/rollout/ep_rew_wrapped_mean | -57.7        |
|    agent/time/fps                    | 4345         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0046558892 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.593       |
|    agent/train/explained_variance    | 0.942        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00363     |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.2         |
|    agent/rollout/ep_rew_wrapped_mean | -58.8        |
|    agent/time/fps                    | 4335         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0031003724 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.608       |
|    agent/train/explained_variance    | 0.846        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00185     |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.2         |
|    agent/rollout/ep_rew_wrapped_mean | -60          |
|    agent/time/fps                    | 4422         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0037997807 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.568       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0102      |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.1         |
|    agent/rollout/ep_rew_wrapped_mean | -61.4        |
|    agent/time/fps                    | 4459         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0019691668 |
|    agent/train/clip_fraction         | 0.0863       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.582       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00577     |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.1         |
|    agent/rollout/ep_rew_wrapped_mean | -62.7        |
|    agent/time/fps                    | 4464         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0030400485 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.554       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00781     |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | -63.8        |
|    agent/time/fps                    | 4513         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0020658872 |
|    agent/train/clip_fraction         | 0.0902       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.583       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00397      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.2         |
|    agent/rollout/ep_rew_wrapped_mean | -65.4        |
|    agent/time/fps                    | 4509         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0037860605 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.585       |
|    agent/train/explained_variance    | 0.935        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0125      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.4        |
|    agent/rollout/ep_rew_wrapped_mean | -66.2       |
|    agent/time/fps                    | 4237        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.002705464 |
|    agent/train/clip_fraction         | 0.105       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.537      |
|    agent/train/explained_variance    | 0.977       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0156     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.2         |
|    agent/rollout/ep_rew_wrapped_mean | -66.8        |
|    agent/time/fps                    | 3555         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0019993903 |
|    agent/train/clip_fraction         | 0.0994       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.545       |
|    agent/train/explained_variance    | 0.778        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0032       |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 15.2       |
|    agent/rollout/ep_rew_wrapped_mean | -66.5      |
|    agent/time/fps                    | 4485       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 61440      |
|    agent/train/approx_kl             | 0.00510817 |
|    agent/train/clip_fraction         | 0.137      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.508     |
|    agent/train/explained_variance    | 0.97       |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00285   |
|    agent/train/n_updates             | 290        |
|    agent/train/policy_gradient_loss  | -0.00269 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.1         |
|    agent/rollout/ep_rew_wrapped_mean | -66.9        |
|    agent/time/fps                    | 4227         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0024346826 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.529       |
|    agent/train/explained_variance    | 0.885        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00902     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.9        |
|    agent/rollout/ep_rew_wrapped_mean | -67.6       |
|    agent/time/fps                    | 4483        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.002653603 |
|    agent/train/clip_fraction         | 0.141       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.55       |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0177     |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | -68.7        |
|    agent/time/fps                    | 4310         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0021412128 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.477       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00404      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.6         |
|    agent/rollout/ep_rew_wrapped_mean | -70.1        |
|    agent/time/fps                    | 4449         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0024473083 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.522       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0124       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.4         |
|    agent/rollout/ep_rew_wrapped_mean | -72          |
|    agent/time/fps                    | 3216         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0023179268 |
|    agent/train/clip_fraction         | 0.112        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.459       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00287      |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.5         |
|    agent/rollout/ep_rew_wrapped_mean | -73.9        |
|    agent/time/fps                    | 3877         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0025365348 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00658      |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.5         |
|    agent/rollout/ep_rew_wrapped_mean | -75.7        |
|    agent/time/fps                    | 4212         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0025567077 |
|    agent/train/clip_fraction         | 0.112        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.526       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.104        |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.5        |
|    agent/rollout/ep_rew_wrapped_mean | -77.6       |
|    agent/time/fps                    | 4384        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.003073072 |
|    agent/train/clip_fraction         | 0.158       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.489      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00317    |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.5        |
|    agent/rollout/ep_rew_wrapped_mean | -78.8       |
|    agent/time/fps                    | 4334        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.003760381 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.45       |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000102   |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.5        |
|    agent/rollout/ep_rew_wrapped_mean | -79.5       |
|    agent/time/fps                    | 4302        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.002722583 |
|    agent/train/clip_fraction         | 0.116       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.423      |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0142     |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.6        |
|    agent/rollout/ep_rew_wrapped_mean | -80.2       |
|    agent/time/fps                    | 4388        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.004098284 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.416      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00339     |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.7        |
|    agent/rollout/ep_rew_wrapped_mean | -80.4       |
|    agent/time/fps                    | 4270        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.007673844 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.392      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0218     |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.7        |
|    agent/rollout/ep_rew_wrapped_mean | -81.7       |
|    agent/time/fps                    | 4305        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.004913278 |
|    agent/train/clip_fraction         | 0.132       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.425      |
|    agent/train/explained_variance    | 0.951       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00753     |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | -82.3        |
|    agent/time/fps                    | 4383         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0043276222 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.391       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.023       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | -82.7        |
|    agent/time/fps                    | 4468         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0037257434 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.408       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0127       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15          |
|    agent/rollout/ep_rew_wrapped_mean | -82.8       |
|    agent/time/fps                    | 4222        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.003924152 |
|    agent/train/clip_fraction         | 0.162       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.403      |
|    agent/train/explained_variance    | 0.991       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0125      |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15          |
|    agent/rollout/ep_rew_wrapped_mean | -82.3       |
|    agent/time/fps                    | 4248        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.006088685 |
|    agent/train/clip_fraction         | 0.175       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.39       |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.164       |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -82.6        |
|    agent/time/fps                    | 4283         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0035506855 |
|    agent/train/clip_fraction         | 0.103        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.362       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0155       |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -82.3        |
|    agent/time/fps                    | 4450         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0037966678 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.305       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0151       |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -81.5        |
|    agent/time/fps                    | 4285         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0033943825 |
|    agent/train/clip_fraction         | 0.0977       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.373       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.028        |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -80.7        |
|    agent/time/fps                    | 4020         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0038998318 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.394       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00366     |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.7         |
|    agent/rollout/ep_rew_wrapped_mean | -79.9        |
|    agent/time/fps                    | 4051         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0028774743 |
|    agent/train/clip_fraction         | 0.0977       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.348       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000568    |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 14.3       |
|    agent/rollout/ep_rew_wrapped_mean | -78.8      |
|    agent/time/fps                    | 4230       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 108544     |
|    agent/train/approx_kl             | 0.00394982 |
|    agent/train/clip_fraction         | 0.105      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.38      |
|    agent/train/explained_variance    | 0.971      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00356   |
|    agent/train/n_updates             | 520        |
|    agent/train/policy_gradient_loss  | -0.00141 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.5         |
|    agent/rollout/ep_rew_wrapped_mean | -77.4        |
|    agent/time/fps                    | 4340         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0038636762 |
|    agent/train/clip_fraction         | 0.133        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.451       |
|    agent/train/explained_variance    | 0.913        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.023        |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.6         |
|    agent/rollout/ep_rew_wrapped_mean | -75.9        |
|    agent/time/fps                    | 4334         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0031979461 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.441       |
|    agent/train/explained_variance    | 0.945        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0118       |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.7        |
|    agent/rollout/ep_rew_wrapped_mean | -74.7       |
|    agent/time/fps                    | 4138        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.006853402 |
|    agent/train/clip_fraction         | 0.189       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.368      |
|    agent/train/explained_variance    | 0.982       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00626    |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | -74          |
|    agent/time/fps                    | 4363         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0039787204 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.325       |
|    agent/train/explained_variance    | 0.937        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00211     |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.8        |
|    agent/rollout/ep_rew_wrapped_mean | -73.3       |
|    agent/time/fps                    | 3950        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 118784      |
|    agent/train/approx_kl             | 0.003944046 |
|    agent/train/clip_fraction         | 0.15        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.336      |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00244    |
|    agent/train/n_updates             | 570         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.7        |
|    agent/rollout/ep_rew_wrapped_mean | -72.5       |
|    agent/time/fps                    | 3875        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.012319871 |
|    agent/train/clip_fraction         | 0.207       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.361      |
|    agent/train/explained_variance    | 0.954       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0535      |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.8        |
|    agent/rollout/ep_rew_wrapped_mean | -72         |
|    agent/time/fps                    | 3937        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.003402711 |
|    agent/train/clip_fraction         | 0.0896      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.376      |
|    agent/train/explained_variance    | 0.886       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0022      |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.7         |
|    agent/rollout/ep_rew_wrapped_mean | -71.5        |
|    agent/time/fps                    | 4271         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0040036477 |
|    agent/train/clip_fraction         | 0.0982       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.357       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000212     |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▅▂▂▂▁▁▂▄▅▆▇▇▇▇▇▇▇████████████████▇▇█████
time/fps,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▂▁▃▂▂▂▂▃▂▂▃▂▂▂▃▂▂▂▃▂▂▃▄▄▆▁▂▃▃▆▄▅▄▃▅▄▄█▆
train/clip_fraction,▇█▄▇▆▃▃▂▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▂▁▂▁▁▁▁▂▂▂▁▁▁▁▁▁▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▃▄▆▇▇██▇▇███▇████████████████████████
train/explained_variance,▁▇▇▇████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,32.4
time/fps,2842.0
train/approx_kl,0.00873
train/clip_fraction,0.07695
train/clip_range,0.1
train/entropy_loss,-0.09801
train/explained_variance,0.99799
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [15:04<10:03, 301.73s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011174985189508232, max=1.0…

Query schedule: [20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Collecting 40 fragments (8000 transitions)
Requested 4800 transitions but only 0 in buffer. Sampling 4800 additional transitions.
Sampling 3200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 18.5     |
|    agent/rollout/ep_rew_wrapped_mean | 250      |
|    agent/time/fps                    | 3948     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 18.5     |
|    agent/rollout/ep_rew_wrapped_mean | 250      |
|    agent/time/fps                    | 3.95e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | 177          |
|    agent/time/fps                    | 4365         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0018001769 |
|    agent/train/clip_fraction         | 0.0539       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.0735       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.017        |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.2         |
|    agent/rollout/ep_rew_wrapped_mean | 137          |
|    agent/time/fps                    | 4525         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0029192963 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.77         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0139       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 31.9       |
|    agent/rollout/ep_rew_wrapped_mean | 111        |
|    agent/time/fps                    | 4395       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 8192       |
|    agent/train/approx_kl             | 0.00335278 |
|    agent/train/clip_fraction         | 0.151      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.69      |
|    agent/train/explained_variance    | 0.577      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00588    |
|    agent/train/n_updates             | 30         |
|    agent/train/policy_gradient_loss  | -0.0073  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | 95.6         |
|    agent/time/fps                    | 4111         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0030169543 |
|    agent/train/clip_fraction         | 0.192        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.683       |
|    agent/train/explained_variance    | 0.88         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00276     |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | 83.2         |
|    agent/time/fps                    | 3265         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0012058034 |
|    agent/train/clip_fraction         | 0.0563       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.802        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0106       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34           |
|    agent/rollout/ep_rew_wrapped_mean | 75.3         |
|    agent/time/fps                    | 2588         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0023173052 |
|    agent/train/clip_fraction         | 0.0929       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0046       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | 68.8         |
|    agent/time/fps                    | 3392         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0030320294 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00934     |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.5         |
|    agent/rollout/ep_rew_wrapped_mean | 64.7         |
|    agent/time/fps                    | 4010         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0029980384 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | 0.823        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0121       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | 61.5         |
|    agent/time/fps                    | 4217         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0029144646 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.678       |
|    agent/train/explained_variance    | 0.702        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00881      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.1         |
|    agent/rollout/ep_rew_wrapped_mean | 57.7         |
|    agent/time/fps                    | 3891         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0038455103 |
|    agent/train/clip_fraction         | 0.251        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.665       |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0137      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.3        |
|    agent/rollout/ep_rew_wrapped_mean | 55.5        |
|    agent/time/fps                    | 3676        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.005171016 |
|    agent/train/clip_fraction         | 0.239       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.647      |
|    agent/train/explained_variance    | 0.901       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0085     |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.8        |
|    agent/rollout/ep_rew_wrapped_mean | 53.1        |
|    agent/time/fps                    | 3987        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.003920542 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.642      |
|    agent/train/explained_variance    | 0.838       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00183    |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.1         |
|    agent/rollout/ep_rew_wrapped_mean | 51.1         |
|    agent/time/fps                    | 4481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0039438964 |
|    agent/train/clip_fraction         | 0.228        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.634       |
|    agent/train/explained_variance    | 0.872        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00172      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.7        |
|    agent/rollout/ep_rew_wrapped_mean | 43.2        |
|    agent/time/fps                    | 3445        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.002970642 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.622      |
|    agent/train/explained_variance    | 0.751       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00416     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | 28           |
|    agent/time/fps                    | 4246         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0031583062 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.611       |
|    agent/train/explained_variance    | 0.92         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0116       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | 17.3         |
|    agent/time/fps                    | 4493         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0031801043 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.623       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00899     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.4         |
|    agent/rollout/ep_rew_wrapped_mean | 6.27         |
|    agent/time/fps                    | 4236         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0027909442 |
|    agent/train/clip_fraction         | 0.122        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.605       |
|    agent/train/explained_variance    | 0.923        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0116       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.1        |
|    agent/rollout/ep_rew_wrapped_mean | 6.84        |
|    agent/time/fps                    | 4249        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003097979 |
|    agent/train/clip_fraction         | 0.179       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.613      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00239     |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.6        |
|    agent/rollout/ep_rew_wrapped_mean | 6.84        |
|    agent/time/fps                    | 4494        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.003286294 |
|    agent/train/clip_fraction         | 0.214       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.598      |
|    agent/train/explained_variance    | 0.953       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0162      |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.5         |
|    agent/rollout/ep_rew_wrapped_mean | 6.14         |
|    agent/time/fps                    | 4188         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0069680703 |
|    agent/train/clip_fraction         | 0.333        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.589       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000278     |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.2        |
|    agent/rollout/ep_rew_wrapped_mean | 4.94        |
|    agent/time/fps                    | 4101        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.006729765 |
|    agent/train/clip_fraction         | 0.335       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.581      |
|    agent/train/explained_variance    | 0.942       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00187    |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | 3.45         |
|    agent/time/fps                    | 4467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0066359118 |
|    agent/train/clip_fraction         | 0.348        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.59        |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00473      |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | 2.47         |
|    agent/time/fps                    | 4132         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0073500625 |
|    agent/train/clip_fraction         | 0.435        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.58        |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00501     |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.9        |
|    agent/rollout/ep_rew_wrapped_mean | 1.07        |
|    agent/time/fps                    | 3879        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.005457877 |
|    agent/train/clip_fraction         | 0.321       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.567      |
|    agent/train/explained_variance    | 0.945       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00985     |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 29.7       |
|    agent/rollout/ep_rew_wrapped_mean | 0.419      |
|    agent/time/fps                    | 4418       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 53248      |
|    agent/train/approx_kl             | 0.00493526 |
|    agent/train/clip_fraction         | 0.319      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.587     |
|    agent/train/explained_variance    | 0.921      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0242     |
|    agent/train/n_updates             | 250        |
|    agent/train/policy_gradient_loss  | -0.0129  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.2         |
|    agent/rollout/ep_rew_wrapped_mean | -1.32        |
|    agent/time/fps                    | 2938         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0070255543 |
|    agent/train/clip_fraction         | 0.36         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.585       |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0398       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.8        |
|    agent/rollout/ep_rew_wrapped_mean | -3.45       |
|    agent/time/fps                    | 4352        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.005607071 |
|    agent/train/clip_fraction         | 0.287       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.584      |
|    agent/train/explained_variance    | 0.952       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00675     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -5.46        |
|    agent/time/fps                    | 4240         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0053345524 |
|    agent/train/clip_fraction         | 0.335        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.597       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00616     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.6         |
|    agent/rollout/ep_rew_wrapped_mean | -7.34        |
|    agent/time/fps                    | 4354         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0044762692 |
|    agent/train/clip_fraction         | 0.305        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.592       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00928     |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 28.3       |
|    agent/rollout/ep_rew_wrapped_mean | -9.97      |
|    agent/time/fps                    | 4099       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 63488      |
|    agent/train/approx_kl             | 0.00428595 |
|    agent/train/clip_fraction         | 0.181      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.593     |
|    agent/train/explained_variance    | 0.965      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00369   |
|    agent/train/n_updates             | 300        |
|    agent/train/policy_gradient_loss  | -0.00384 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.5         |
|    agent/rollout/ep_rew_wrapped_mean | -12          |
|    agent/time/fps                    | 4119         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0042206324 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.576       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00402     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 27.5       |
|    agent/rollout/ep_rew_wrapped_mean | -13.5      |
|    agent/time/fps                    | 4367       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 67584      |
|    agent/train/approx_kl             | 0.00630732 |
|    agent/train/clip_fraction         | 0.293      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.568     |
|    agent/train/explained_variance    | 0.955      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0212    |
|    agent/train/n_updates             | 320        |
|    agent/train/policy_gradient_loss  | -0.00972 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.4         |
|    agent/rollout/ep_rew_wrapped_mean | -15.6        |
|    agent/time/fps                    | 4341         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0039634025 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.556       |
|    agent/train/explained_variance    | 0.896        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00797     |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.8         |
|    agent/rollout/ep_rew_wrapped_mean | -15.7        |
|    agent/time/fps                    | 4138         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0053113536 |
|    agent/train/clip_fraction         | 0.264        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.527       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00592     |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.1        |
|    agent/rollout/ep_rew_wrapped_mean | -16.9       |
|    agent/time/fps                    | 4367        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.008265538 |
|    agent/train/clip_fraction         | 0.286       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.482      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0206     |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.2        |
|    agent/rollout/ep_rew_wrapped_mean | -18.2       |
|    agent/time/fps                    | 4287        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 75776       |
|    agent/train/approx_kl             | 0.002782022 |
|    agent/train/clip_fraction         | 0.153       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.548      |
|    agent/train/explained_variance    | 0.752       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0377      |
|    agent/train/n_updates             | 360         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.6        |
|    agent/rollout/ep_rew_wrapped_mean | -18.4       |
|    agent/time/fps                    | 3480        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.010207217 |
|    agent/train/clip_fraction         | 0.169       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.41       |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0245     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.6        |
|    agent/rollout/ep_rew_wrapped_mean | -18.1       |
|    agent/time/fps                    | 4223        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.011606585 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.314      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.012      |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -18.1        |
|    agent/time/fps                    | 3856         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0049767056 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.299       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0142       |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.7        |
|    agent/rollout/ep_rew_wrapped_mean | -17.5       |
|    agent/time/fps                    | 4255        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.008601745 |
|    agent/train/clip_fraction         | 0.138       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.348      |
|    agent/train/explained_variance    | 0.883       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0124      |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.5         |
|    agent/rollout/ep_rew_wrapped_mean | -16.6        |
|    agent/time/fps                    | 3585         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0027996881 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.363       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00802     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -15.7        |
|    agent/time/fps                    | 4010         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0030891253 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.417       |
|    agent/train/explained_variance    | 0.957        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00681      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.6        |
|    agent/rollout/ep_rew_wrapped_mean | -16.5       |
|    agent/time/fps                    | 1192        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.008103176 |
|    agent/train/clip_fraction         | 0.179       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.471      |
|    agent/train/explained_variance    | 0.873       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00169    |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.6        |
|    agent/rollout/ep_rew_wrapped_mean | -18.5       |
|    agent/time/fps                    | 1751        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.010430707 |
|    agent/train/clip_fraction         | 0.219       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.436      |
|    agent/train/explained_variance    | 0.9         |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0168      |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -19.8        |
|    agent/time/fps                    | 3662         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0020532142 |
|    agent/train/clip_fraction         | 0.0915       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00423     |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.6         |
|    agent/rollout/ep_rew_wrapped_mean | -19.9        |
|    agent/time/fps                    | 4062         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0056781224 |
|    agent/train/clip_fraction         | 0.18         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.425       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0101       |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.1         |
|    agent/rollout/ep_rew_wrapped_mean | -20.4        |
|    agent/time/fps                    | 2994         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0024671196 |
|    agent/train/clip_fraction         | 0.0981       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.423       |
|    agent/train/explained_variance    | 0.957        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0139      |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.2         |
|    agent/rollout/ep_rew_wrapped_mean | -21.1        |
|    agent/time/fps                    | 3649         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0033086312 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.426       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00696     |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.7        |
|    agent/rollout/ep_rew_wrapped_mean | -20.9       |
|    agent/time/fps                    | 4306        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.002740438 |
|    agent/train/clip_fraction         | 0.0967      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.403      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0156     |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.8         |
|    agent/rollout/ep_rew_wrapped_mean | -21          |
|    agent/time/fps                    | 3551         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0037516763 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.507       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00643      |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26          |
|    agent/rollout/ep_rew_wrapped_mean | -21.5       |
|    agent/time/fps                    | 3911        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.003022196 |
|    agent/train/clip_fraction         | 0.112       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.424      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00779     |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26           |
|    agent/rollout/ep_rew_wrapped_mean | -22.7        |
|    agent/time/fps                    | 4336         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0056963367 |
|    agent/train/clip_fraction         | 0.17         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00469     |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 26         |
|    agent/rollout/ep_rew_wrapped_mean | -22.7      |
|    agent/time/fps                    | 4449       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 110592     |
|    agent/train/approx_kl             | 0.00423137 |
|    agent/train/clip_fraction         | 0.22       |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.476     |
|    agent/train/explained_variance    | 0.982      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00669    |
|    agent/train/n_updates             | 530        |
|    agent/train/policy_gradient_loss  | -0.00749 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.3        |
|    agent/rollout/ep_rew_wrapped_mean | -23.3       |
|    agent/time/fps                    | 4119        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.008210919 |
|    agent/train/clip_fraction         | 0.198       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.428      |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000605   |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.5        |
|    agent/rollout/ep_rew_wrapped_mean | -23.8       |
|    agent/time/fps                    | 4375        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.005260653 |
|    agent/train/clip_fraction         | 0.218       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.468      |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00815    |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.8        |
|    agent/rollout/ep_rew_wrapped_mean | -23.8       |
|    agent/time/fps                    | 3278        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.005576783 |
|    agent/train/clip_fraction         | 0.202       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.482      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.027      |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | -24.1        |
|    agent/time/fps                    | 4463         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0061524184 |
|    agent/train/clip_fraction         | 0.226        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.454       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00583      |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | -23.8        |
|    agent/time/fps                    | 4235         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0054477565 |
|    agent/train/clip_fraction         | 0.22         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.481       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.014       |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.8         |
|    agent/rollout/ep_rew_wrapped_mean | -23.1        |
|    agent/time/fps                    | 4110         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0035389764 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.438       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0164       |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23          |
|    agent/rollout/ep_rew_wrapped_mean | -22.2       |
|    agent/time/fps                    | 4368        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.006744908 |
|    agent/train/clip_fraction         | 0.182       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.41       |
|    agent/train/explained_variance    | 0.977       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0201     |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,█▅▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁
time/fps,▁▅▆▆▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████████
train/approx_kl,▂▅▂▂▂▁▂▁▃▂▂▂▃▃▂▃▃▂▅▂▅▅▂▂▄▄▆▃▁▆▅▁▅▅▅▃█▂▂▂
train/clip_fraction,▅█▄▄▃▃▄▁▂▄▂▄▄▃▃▃▄▂▃▂▃▃▂▂▁▃▁▁▁▃▄▂▂▂▂▂▃▁▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▄▆▆▅▇▇▆▇▅▆▆▆▆▆▇▇▆▇▇█▇█▇███▇▇▇█▇██████
train/explained_variance,▁▇████████████▆███▇█████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,9.37
time/fps,2835.0
train/approx_kl,0.00445
train/clip_fraction,0.06025
train/clip_range,0.1
train/entropy_loss,-0.07839
train/explained_variance,0.99973
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [20:18<05:06, 306.49s/it]

Query schedule: [20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Collecting 40 fragments (8000 transitions)
Requested 4800 transitions but only 0 in buffer. Sampling 4800 additional transitions.
Sampling 3200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.8     |
|    agent/rollout/ep_rew_wrapped_mean | 1.53     |
|    agent/time/fps                    | 4494     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 31.8     |
|    agent/rollout/ep_rew_wrapped_mean | 1.53     |
|    agent/time/fps                    | 4.49e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.1        |
|    agent/rollout/ep_rew_wrapped_mean | -13.1       |
|    agent/time/fps                    | 4070        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.002466042 |
|    agent/train/clip_fraction         | 0.0604      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.691      |
|    agent/train/explained_variance    | 0.0497      |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00917     |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.4         |
|    agent/rollout/ep_rew_wrapped_mean | -23          |
|    agent/time/fps                    | 4245         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0012145946 |
|    agent/train/clip_fraction         | 0.0308       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | -0.104       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0117       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.2        |
|    agent/rollout/ep_rew_wrapped_mean | -30         |
|    agent/time/fps                    | 4482        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.003062643 |
|    agent/train/clip_fraction         | 0.172       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.689      |
|    agent/train/explained_variance    | -0.0609     |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00859     |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.8         |
|    agent/rollout/ep_rew_wrapped_mean | -33.8        |
|    agent/time/fps                    | 4464         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0024771956 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.684       |
|    agent/train/explained_variance    | -0.109       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00347      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.1        |
|    agent/rollout/ep_rew_wrapped_mean | -38         |
|    agent/time/fps                    | 4445        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.003086279 |
|    agent/train/clip_fraction         | 0.114       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.673      |
|    agent/train/explained_variance    | 0.524       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0671      |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32          |
|    agent/rollout/ep_rew_wrapped_mean | -41.1       |
|    agent/time/fps                    | 4536        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.002371261 |
|    agent/train/clip_fraction         | 0.117       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.663      |
|    agent/train/explained_variance    | 0.871       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0564      |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34           |
|    agent/rollout/ep_rew_wrapped_mean | -42.4        |
|    agent/time/fps                    | 4424         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0034072315 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.653       |
|    agent/train/explained_variance    | 0.94         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0366       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.8         |
|    agent/rollout/ep_rew_wrapped_mean | -43.9        |
|    agent/time/fps                    | 4508         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0011141098 |
|    agent/train/clip_fraction         | 0.0482       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0669       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.3         |
|    agent/rollout/ep_rew_wrapped_mean | -46.9        |
|    agent/time/fps                    | 4424         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0026159317 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.644       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00308      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.1         |
|    agent/rollout/ep_rew_wrapped_mean | -46.5        |
|    agent/time/fps                    | 4280         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0028149635 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.645       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0096      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.2         |
|    agent/rollout/ep_rew_wrapped_mean | -43.8        |
|    agent/time/fps                    | 4442         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0024603652 |
|    agent/train/clip_fraction         | 0.0983       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.619       |
|    agent/train/explained_variance    | 0.919        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00582      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | -42.4        |
|    agent/time/fps                    | 4279         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0025524155 |
|    agent/train/clip_fraction         | 0.0694       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.608       |
|    agent/train/explained_variance    | 0.875        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0135       |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31          |
|    agent/rollout/ep_rew_wrapped_mean | -40.5       |
|    agent/time/fps                    | 4362        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.002150202 |
|    agent/train/clip_fraction         | 0.154       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.633      |
|    agent/train/explained_variance    | 0.882       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00287     |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | -40          |
|    agent/time/fps                    | 4419         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0027025668 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.599       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00315      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.3        |
|    agent/rollout/ep_rew_wrapped_mean | -42         |
|    agent/time/fps                    | 4415        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.006476253 |
|    agent/train/clip_fraction         | 0.183       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.558      |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00725    |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 31.9       |
|    agent/rollout/ep_rew_wrapped_mean | -39.8      |
|    agent/time/fps                    | 4417       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 34816      |
|    agent/train/approx_kl             | 0.00623273 |
|    agent/train/clip_fraction         | 0.191      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.523     |
|    agent/train/explained_variance    | 0.959      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0035     |
|    agent/train/n_updates             | 160        |
|    agent/train/policy_gradient_loss  | -0.00977 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.8        |
|    agent/rollout/ep_rew_wrapped_mean | -44.1       |
|    agent/time/fps                    | 4372        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.005891945 |
|    agent/train/clip_fraction         | 0.09        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.49       |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0898      |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.6        |
|    agent/rollout/ep_rew_wrapped_mean | -42.2       |
|    agent/time/fps                    | 4504        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.008551619 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.409      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0117     |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.8        |
|    agent/rollout/ep_rew_wrapped_mean | -40.1       |
|    agent/time/fps                    | 4340        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.004383395 |
|    agent/train/clip_fraction         | 0.163       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.471      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00959     |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.9        |
|    agent/rollout/ep_rew_wrapped_mean | -38         |
|    agent/time/fps                    | 4475        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.005605444 |
|    agent/train/clip_fraction         | 0.116       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.343      |
|    agent/train/explained_variance    | 0.997       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00385     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.2        |
|    agent/rollout/ep_rew_wrapped_mean | -35.5       |
|    agent/time/fps                    | 4158        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.002864881 |
|    agent/train/clip_fraction         | 0.0807      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.317      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00234    |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.1         |
|    agent/rollout/ep_rew_wrapped_mean | -33.7        |
|    agent/time/fps                    | 3571         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0033884055 |
|    agent/train/clip_fraction         | 0.0884       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.284       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00578     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.2         |
|    agent/rollout/ep_rew_wrapped_mean | -31.4        |
|    agent/time/fps                    | 4433         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0008511256 |
|    agent/train/clip_fraction         | 0.0621       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.239       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00681     |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.7        |
|    agent/rollout/ep_rew_wrapped_mean | -29.9       |
|    agent/time/fps                    | 4289        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.008419103 |
|    agent/train/clip_fraction         | 0.0849      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.244      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0121      |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | -28.8        |
|    agent/time/fps                    | 4022         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0036454892 |
|    agent/train/clip_fraction         | 0.0891       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.253       |
|    agent/train/explained_variance    | 0.957        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00332      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | -28.8        |
|    agent/time/fps                    | 4514         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0036267529 |
|    agent/train/clip_fraction         | 0.0937       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.341       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0203       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.8        |
|    agent/rollout/ep_rew_wrapped_mean | -27.4       |
|    agent/time/fps                    | 4201        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.002265788 |
|    agent/train/clip_fraction         | 0.0809      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.329      |
|    agent/train/explained_variance    | 0.918       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0247      |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | -28.1        |
|    agent/time/fps                    | 4369         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0013302495 |
|    agent/train/clip_fraction         | 0.0603       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.26        |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0297       |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.4         |
|    agent/rollout/ep_rew_wrapped_mean | -27.2        |
|    agent/time/fps                    | 4361         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0024877372 |
|    agent/train/clip_fraction         | 0.0549       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.233       |
|    agent/train/explained_variance    | 0.916        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0122       |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.8         |
|    agent/rollout/ep_rew_wrapped_mean | -29.1        |
|    agent/time/fps                    | 4250         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0040943148 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.326       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00627      |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.6         |
|    agent/rollout/ep_rew_wrapped_mean | -30.3        |
|    agent/time/fps                    | 4475         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0019354236 |
|    agent/train/clip_fraction         | 0.0843       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.351       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000552    |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.5         |
|    agent/rollout/ep_rew_wrapped_mean | -32          |
|    agent/time/fps                    | 4279         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0026131673 |
|    agent/train/clip_fraction         | 0.0914       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.308       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00361      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.5         |
|    agent/rollout/ep_rew_wrapped_mean | -33.4        |
|    agent/time/fps                    | 4218         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0028740473 |
|    agent/train/clip_fraction         | 0.119        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.408       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00373     |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 41.2        |
|    agent/rollout/ep_rew_wrapped_mean | -34.9       |
|    agent/time/fps                    | 4181        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.003456635 |
|    agent/train/clip_fraction         | 0.112       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.445      |
|    agent/train/explained_variance    | 0.905       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0427      |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 44.6       |
|    agent/rollout/ep_rew_wrapped_mean | -35.9      |
|    agent/time/fps                    | 4370       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 73728      |
|    agent/train/approx_kl             | 0.00406585 |
|    agent/train/clip_fraction         | 0.149      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.393     |
|    agent/train/explained_variance    | 0.976      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00079   |
|    agent/train/n_updates             | 350        |
|    agent/train/policy_gradient_loss  | -0.00311 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 48.1         |
|    agent/rollout/ep_rew_wrapped_mean | -36.3        |
|    agent/time/fps                    | 4413         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0055113877 |
|    agent/train/clip_fraction         | 0.277        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.549       |
|    agent/train/explained_variance    | 0.964        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0196       |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 53.5        |
|    agent/rollout/ep_rew_wrapped_mean | -35.6       |
|    agent/time/fps                    | 4324        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.006346123 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.436      |
|    agent/train/explained_variance    | 0.956       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0405      |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 56.3         |
|    agent/rollout/ep_rew_wrapped_mean | -35          |
|    agent/time/fps                    | 4298         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0056691933 |
|    agent/train/clip_fraction         | 0.211        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.529       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0156       |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 59.8         |
|    agent/rollout/ep_rew_wrapped_mean | -34.1        |
|    agent/time/fps                    | 3386         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0017975916 |
|    agent/train/clip_fraction         | 0.187        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.533       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00344      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 61.4        |
|    agent/rollout/ep_rew_wrapped_mean | -32.4       |
|    agent/time/fps                    | 4406        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.005414089 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.539      |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0232      |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 64.1        |
|    agent/rollout/ep_rew_wrapped_mean | -31.4       |
|    agent/time/fps                    | 4191        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.003590555 |
|    agent/train/clip_fraction         | 0.161       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.536      |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0348      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 67.4       |
|    agent/rollout/ep_rew_wrapped_mean | -27.4      |
|    agent/time/fps                    | 4507       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 88064      |
|    agent/train/approx_kl             | 0.00431975 |
|    agent/train/clip_fraction         | 0.134      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.544     |
|    agent/train/explained_variance    | 0.963      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0175     |
|    agent/train/n_updates             | 420        |
|    agent/train/policy_gradient_loss  | 0.00075  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 70.6         |
|    agent/rollout/ep_rew_wrapped_mean | -23.8        |
|    agent/time/fps                    | 4377         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0023624161 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.547       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0323       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 73.8         |
|    agent/rollout/ep_rew_wrapped_mean | -19.3        |
|    agent/time/fps                    | 4384         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0040885126 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.56        |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00162     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 77.5         |
|    agent/rollout/ep_rew_wrapped_mean | -15          |
|    agent/time/fps                    | 4245         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0039278846 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.552       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0349       |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 81.7        |
|    agent/rollout/ep_rew_wrapped_mean | -8.33       |
|    agent/time/fps                    | 4175        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.002835322 |
|    agent/train/clip_fraction         | 0.113       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.546      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0177      |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 85.8        |
|    agent/rollout/ep_rew_wrapped_mean | -2.17       |
|    agent/time/fps                    | 4201        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.005936977 |
|    agent/train/clip_fraction         | 0.18        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.554      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.035       |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 89           |
|    agent/rollout/ep_rew_wrapped_mean | 5.08         |
|    agent/time/fps                    | 4052         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0032224555 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.554       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0128       |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 93.9         |
|    agent/rollout/ep_rew_wrapped_mean | 11.1         |
|    agent/time/fps                    | 4223         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0025386694 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.554       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0121       |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 97.5         |
|    agent/rollout/ep_rew_wrapped_mean | 18.5         |
|    agent/time/fps                    | 4393         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0061153183 |
|    agent/train/clip_fraction         | 0.245        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.556       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00532      |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 101         |
|    agent/rollout/ep_rew_wrapped_mean | 25.3        |
|    agent/time/fps                    | 4265        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.003233476 |
|    agent/train/clip_fraction         | 0.189       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.574      |
|    agent/train/explained_variance    | 0.998       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00249     |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 105         |
|    agent/rollout/ep_rew_wrapped_mean | 32.3        |
|    agent/time/fps                    | 4160        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 108544      |
|    agent/train/approx_kl             | 0.004280553 |
|    agent/train/clip_fraction         | 0.179       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.56       |
|    agent/train/explained_variance    | 0.998       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0199      |
|    agent/train/n_updates             | 520         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 109          |
|    agent/rollout/ep_rew_wrapped_mean | 39.6         |
|    agent/time/fps                    | 4469         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0032328994 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0087       |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 113         |
|    agent/rollout/ep_rew_wrapped_mean | 48.6        |
|    agent/time/fps                    | 4284        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.003212351 |
|    agent/train/clip_fraction         | 0.171       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.534      |
|    agent/train/explained_variance    | 0.999       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0135      |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 116         |
|    agent/rollout/ep_rew_wrapped_mean | 57.4        |
|    agent/time/fps                    | 4160        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.003376196 |
|    agent/train/clip_fraction         | 0.204       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.528      |
|    agent/train/explained_variance    | 0.999       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00192    |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 119          |
|    agent/rollout/ep_rew_wrapped_mean | 65.5         |
|    agent/time/fps                    | 4190         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0035515144 |
|    agent/train/clip_fraction         | 0.187        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.519       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0107      |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 119         |
|    agent/rollout/ep_rew_wrapped_mean | 73.1        |
|    agent/time/fps                    | 4060        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 118784      |
|    agent/train/approx_kl             | 0.003248849 |
|    agent/train/clip_fraction         | 0.175       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.525      |
|    agent/train/explained_variance    | 0.999       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00238     |
|    agent/train/n_updates             | 570         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 121          |
|    agent/rollout/ep_rew_wrapped_mean | 80.4         |
|    agent/time/fps                    | 4412         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0045553534 |
|    agent/train/clip_fraction         | 0.182        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000533    |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 122          |
|    agent/rollout/ep_rew_wrapped_mean | 86.9         |
|    agent/time/fps                    | 4296         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0043001724 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.492       |
|    agent/train/explained_variance    | 1            |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00372      |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 123          |
|    agent/rollout/ep_rew_wrapped_mean | 93.2         |
|    agent/time/fps                    | 4188         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0037575176 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.479       |
|    agent/train/explained_variance    | 1            |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0075      |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▂▄▆▇███▇▇▇▇███████▇▇▇▇▇▇██▇██████████
time/fps,█▃▃▃▂▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂
train/approx_kl,▂▂▂▂▃▁▁▂▃▃▆▂▂▄▄▄▂▃▁▂▂▅▄▄▂▂▃▃▄▆▃▄▃▅▄▃▂█▃▃
train/clip_fraction,█▅▃▃▄▁▃▄▅▄▄▄▂▃▃▃▂▃▂▂▂▂▂▂▂▂▃▂▂▁▂▂▁▁▂▂▂▂▂▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▂▂▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████▇████████
train/explained_variance,▁▇▇█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,126.42
time/fps,2856.0
train/approx_kl,0.00454
train/clip_fraction,0.0939
train/clip_range,0.1
train/entropy_loss,-0.17805
train/explained_variance,1.0
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [25:15<00:00, 303.04s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011138488432495958, max=1.0…

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (12000 transitions)
Requested 7200 transitions but only 0 in buffer. Sampling 7200 additional transitions.
Sampling 4800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 22.5     |
|    agent/rollout/ep_rew_wrapped_mean | 107      |
|    agent/time/fps                    | 4322     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 22.5     |
|    agent/rollout/ep_rew_wrapped_mean | 107      |
|    agent/time/fps                    | 4.32e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.5         |
|    agent/rollout/ep_rew_wrapped_mean | 82.6         |
|    agent/time/fps                    | 4365         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0029051427 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.0917       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00482     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | 67.4         |
|    agent/time/fps                    | 4467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0024258865 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.688       |
|    agent/train/explained_variance    | 0.241        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00696     |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 21.5        |
|    agent/rollout/ep_rew_wrapped_mean | 55.5        |
|    agent/time/fps                    | 4508        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.002295774 |
|    agent/train/clip_fraction         | 0.13        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.68       |
|    agent/train/explained_variance    | 0.145       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0113     |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.6        |
|    agent/rollout/ep_rew_wrapped_mean | 46.7        |
|    agent/time/fps                    | 4480        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.004093293 |
|    agent/train/clip_fraction         | 0.286       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.667      |
|    agent/train/explained_variance    | 0.784       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0233     |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.5         |
|    agent/rollout/ep_rew_wrapped_mean | 39.4         |
|    agent/time/fps                    | 4403         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0043473598 |
|    agent/train/clip_fraction         | 0.293        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.818        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0166      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.4         |
|    agent/rollout/ep_rew_wrapped_mean | 33.9         |
|    agent/time/fps                    | 4402         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0030128148 |
|    agent/train/clip_fraction         | 0.18         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.633       |
|    agent/train/explained_variance    | 0.915        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0169      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.5        |
|    agent/rollout/ep_rew_wrapped_mean | 29.6        |
|    agent/time/fps                    | 4374        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.003108301 |
|    agent/train/clip_fraction         | 0.181       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.618      |
|    agent/train/explained_variance    | 0.903       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0191     |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.4         |
|    agent/rollout/ep_rew_wrapped_mean | 25.8         |
|    agent/time/fps                    | 4304         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0027906548 |
|    agent/train/clip_fraction         | 0.174        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.614       |
|    agent/train/explained_variance    | 0.933        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00149      |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.6        |
|    agent/rollout/ep_rew_wrapped_mean | 21.9        |
|    agent/time/fps                    | 4371        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.002354896 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.587      |
|    agent/train/explained_variance    | 0.931       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0175     |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.9         |
|    agent/rollout/ep_rew_wrapped_mean | 18.1         |
|    agent/time/fps                    | 4509         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0061316304 |
|    agent/train/clip_fraction         | 0.217        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.544       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0282      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.4         |
|    agent/rollout/ep_rew_wrapped_mean | 14.7         |
|    agent/time/fps                    | 4384         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0041527594 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.506       |
|    agent/train/explained_variance    | 0.933        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00602     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.9         |
|    agent/rollout/ep_rew_wrapped_mean | 11.7         |
|    agent/time/fps                    | 4335         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0038947575 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.476       |
|    agent/train/explained_variance    | 0.841        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.036        |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.5        |
|    agent/rollout/ep_rew_wrapped_mean | 6.66        |
|    agent/time/fps                    | 4359        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.003550969 |
|    agent/train/clip_fraction         | 0.172       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.465      |
|    agent/train/explained_variance    | 0.939       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00266    |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | 2.82         |
|    agent/time/fps                    | 4467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0022321786 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.452       |
|    agent/train/explained_variance    | 0.887        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00839     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.7        |
|    agent/rollout/ep_rew_wrapped_mean | -6.22       |
|    agent/time/fps                    | 4447        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.005894516 |
|    agent/train/clip_fraction         | 0.168       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.454      |
|    agent/train/explained_variance    | 0.814       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0287     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.7         |
|    agent/rollout/ep_rew_wrapped_mean | -18.2        |
|    agent/time/fps                    | 4435         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0027657938 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.502       |
|    agent/train/explained_variance    | 0.879        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00974     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16           |
|    agent/rollout/ep_rew_wrapped_mean | -24.8        |
|    agent/time/fps                    | 3631         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0017564917 |
|    agent/train/clip_fraction         | 0.0823       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.474       |
|    agent/train/explained_variance    | -0.426       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0111       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16           |
|    agent/rollout/ep_rew_wrapped_mean | -26.1        |
|    agent/time/fps                    | 4327         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0028350856 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.441       |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0138      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.3         |
|    agent/rollout/ep_rew_wrapped_mean | -27.3        |
|    agent/time/fps                    | 4269         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0022933558 |
|    agent/train/clip_fraction         | 0.0831       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.51         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00949     |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17           |
|    agent/rollout/ep_rew_wrapped_mean | -27.5        |
|    agent/time/fps                    | 2052         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0014243552 |
|    agent/train/clip_fraction         | 0.0771       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.486       |
|    agent/train/explained_variance    | 0.758        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0034      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.4        |
|    agent/rollout/ep_rew_wrapped_mean | -27.2       |
|    agent/time/fps                    | 1661        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.004219709 |
|    agent/train/clip_fraction         | 0.109       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.467      |
|    agent/train/explained_variance    | 0.828       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0316     |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 17.6        |
|    agent/rollout/ep_rew_wrapped_mean | -27.3       |
|    agent/time/fps                    | 4120        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.008962264 |
|    agent/train/clip_fraction         | 0.158       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.413      |
|    agent/train/explained_variance    | 0.897       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0131     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.4         |
|    agent/rollout/ep_rew_wrapped_mean | -27.6        |
|    agent/time/fps                    | 4512         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0052207476 |
|    agent/train/clip_fraction         | 0.0987       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.395       |
|    agent/train/explained_variance    | 0.923        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0189      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.4         |
|    agent/rollout/ep_rew_wrapped_mean | -27.5        |
|    agent/time/fps                    | 4266         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0045193485 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.39        |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00871     |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.1         |
|    agent/rollout/ep_rew_wrapped_mean | -27.3        |
|    agent/time/fps                    | 4352         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0049887486 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.386       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00612     |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.4         |
|    agent/rollout/ep_rew_wrapped_mean | -26.8        |
|    agent/time/fps                    | 4491         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0065875025 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.385       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0107      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.8         |
|    agent/rollout/ep_rew_wrapped_mean | -25.7        |
|    agent/time/fps                    | 4472         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0020117275 |
|    agent/train/clip_fraction         | 0.0791       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.357       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00552      |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.8        |
|    agent/rollout/ep_rew_wrapped_mean | -24.2       |
|    agent/time/fps                    | 4376        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 59392       |
|    agent/train/approx_kl             | 0.003814422 |
|    agent/train/clip_fraction         | 0.108       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.377      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00963     |
|    agent/train/n_updates             | 280         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.7         |
|    agent/rollout/ep_rew_wrapped_mean | -23.1        |
|    agent/time/fps                    | 4359         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0032623103 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0147      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.3         |
|    agent/rollout/ep_rew_wrapped_mean | -21.6        |
|    agent/time/fps                    | 4440         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0034069528 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.383       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0106      |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.9         |
|    agent/rollout/ep_rew_wrapped_mean | -20.3        |
|    agent/time/fps                    | 4122         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0010052394 |
|    agent/train/clip_fraction         | 0.0609       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.321       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00626     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.7         |
|    agent/rollout/ep_rew_wrapped_mean | -19.5        |
|    agent/time/fps                    | 4330         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0032157097 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.351       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00806     |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.4        |
|    agent/rollout/ep_rew_wrapped_mean | -18.3       |
|    agent/time/fps                    | 4367        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.012069025 |
|    agent/train/clip_fraction         | 0.243       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.366      |
|    agent/train/explained_variance    | 0.94        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0214     |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.3        |
|    agent/rollout/ep_rew_wrapped_mean | -17.1       |
|    agent/time/fps                    | 4489        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.005722295 |
|    agent/train/clip_fraction         | 0.223       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.365      |
|    agent/train/explained_variance    | 0.923       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0214      |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.2         |
|    agent/rollout/ep_rew_wrapped_mean | -15.2        |
|    agent/time/fps                    | 4477         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0036322298 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.343       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00117     |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.1         |
|    agent/rollout/ep_rew_wrapped_mean | -14.4        |
|    agent/time/fps                    | 4199         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0029918167 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.328       |
|    agent/train/explained_variance    | 0.931        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0102      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14          |
|    agent/rollout/ep_rew_wrapped_mean | -13.9       |
|    agent/time/fps                    | 4164        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005986671 |
|    agent/train/clip_fraction         | 0.196       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.328      |
|    agent/train/explained_variance    | 0.939       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0133     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14          |
|    agent/rollout/ep_rew_wrapped_mean | -14.3       |
|    agent/time/fps                    | 4270        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.005204282 |
|    agent/train/clip_fraction         | 0.138       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.285      |
|    agent/train/explained_variance    | 0.914       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0123     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13.8        |
|    agent/rollout/ep_rew_wrapped_mean | -14.5       |
|    agent/time/fps                    | 4442        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.009472188 |
|    agent/train/clip_fraction         | 0.137       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.225      |
|    agent/train/explained_variance    | 0.953       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.021      |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.3         |
|    agent/rollout/ep_rew_wrapped_mean | -15.1        |
|    agent/time/fps                    | 3875         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0053176787 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.226       |
|    agent/train/explained_variance    | 0.964        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00841     |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 13          |
|    agent/rollout/ep_rew_wrapped_mean | -16         |
|    agent/time/fps                    | 4259        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.007645579 |
|    agent/train/clip_fraction         | 0.0845      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.184      |
|    agent/train/explained_variance    | 0.9         |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00349    |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.5         |
|    agent/rollout/ep_rew_wrapped_mean | -16.4        |
|    agent/time/fps                    | 4265         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0064620767 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.16        |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00827     |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 12.1         |
|    agent/rollout/ep_rew_wrapped_mean | -16.6        |
|    agent/time/fps                    | 4399         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0052088485 |
|    agent/train/clip_fraction         | 0.0741       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.141       |
|    agent/train/explained_variance    | 0.941        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00263     |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.6         |
|    agent/rollout/ep_rew_wrapped_mean | -17.2        |
|    agent/time/fps                    | 4206         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0054910444 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.204       |
|    agent/train/explained_variance    | 0.905        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0248      |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.7        |
|    agent/rollout/ep_rew_wrapped_mean | -18.2       |
|    agent/time/fps                    | 4346        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.005048075 |
|    agent/train/clip_fraction         | 0.103       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.194      |
|    agent/train/explained_variance    | 0.895       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00135    |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10           |
|    agent/rollout/ep_rew_wrapped_mean | -19.3        |
|    agent/time/fps                    | 4214         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0038922788 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.189       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0117      |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.5          |
|    agent/rollout/ep_rew_wrapped_mean | -20          |
|    agent/time/fps                    | 3980         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0028493765 |
|    agent/train/clip_fraction         | 0.0941       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.198       |
|    agent/train/explained_variance    | 0.902        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00357     |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.3         |
|    agent/rollout/ep_rew_wrapped_mean | -20.4       |
|    agent/time/fps                    | 4053        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 100352      |
|    agent/train/approx_kl             | 0.007137655 |
|    agent/train/clip_fraction         | 0.0953      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.199      |
|    agent/train/explained_variance    | 0.902       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.006      |
|    agent/train/n_updates             | 480         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.92         |
|    agent/rollout/ep_rew_wrapped_mean | -21.5        |
|    agent/time/fps                    | 4080         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0029055094 |
|    agent/train/clip_fraction         | 0.0743       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.176       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000322     |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 8.62       |
|    agent/rollout/ep_rew_wrapped_mean | -22.4      |
|    agent/time/fps                    | 4162       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 104448     |
|    agent/train/approx_kl             | 0.00274564 |
|    agent/train/clip_fraction         | 0.0668     |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.208     |
|    agent/train/explained_variance    | 0.939      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0068    |
|    agent/train/n_updates             | 500        |
|    agent/train/policy_gradient_loss  | 0.000685 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.58        |
|    agent/rollout/ep_rew_wrapped_mean | -23.2       |
|    agent/time/fps                    | 3940        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.007822788 |
|    agent/train/clip_fraction         | 0.0849      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.172      |
|    agent/train/explained_variance    | 0.931       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00587    |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.51        |
|    agent/rollout/ep_rew_wrapped_mean | -23.7       |
|    agent/time/fps                    | 3038        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 108544      |
|    agent/train/approx_kl             | 0.004097365 |
|    agent/train/clip_fraction         | 0.0868      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.18       |
|    agent/train/explained_variance    | 0.927       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0176     |
|    agent/train/n_updates             | 520         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.44        |
|    agent/rollout/ep_rew_wrapped_mean | -24         |
|    agent/time/fps                    | 4209        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.013373956 |
|    agent/train/clip_fraction         | 0.114       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.197      |
|    agent/train/explained_variance    | 0.919       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0062      |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.45         |
|    agent/rollout/ep_rew_wrapped_mean | -24.3        |
|    agent/time/fps                    | 4277         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0036414242 |
|    agent/train/clip_fraction         | 0.0986       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.196       |
|    agent/train/explained_variance    | 0.913        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00256     |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.48        |
|    agent/rollout/ep_rew_wrapped_mean | -24.1       |
|    agent/time/fps                    | 3001        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.004136878 |
|    agent/train/clip_fraction         | 0.0947      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.172      |
|    agent/train/explained_variance    | 0.95        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000633   |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.61        |
|    agent/rollout/ep_rew_wrapped_mean | -23.9       |
|    agent/time/fps                    | 4081        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.004155646 |
|    agent/train/clip_fraction         | 0.0789      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.174      |
|    agent/train/explained_variance    | 0.94        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0113     |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.88         |
|    agent/rollout/ep_rew_wrapped_mean | -23.5        |
|    agent/time/fps                    | 3834         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0041190395 |
|    agent/train/clip_fraction         | 0.0965       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.194       |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00911      |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.08         |
|    agent/rollout/ep_rew_wrapped_mean | -22.8        |
|    agent/time/fps                    | 4173         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0025209908 |
|    agent/train/clip_fraction         | 0.0904       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.219       |
|    agent/train/explained_variance    | 0.916        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000661    |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.39         |
|    agent/rollout/ep_rew_wrapped_mean | -22.5        |
|    agent/time/fps                    | 4147         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0021196965 |
|    agent/train/clip_fraction         | 0.091        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.298       |
|    agent/train/explained_variance    | 0.0782       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0478       |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.6          |
|    agent/rollout/ep_rew_wrapped_mean | -22.9        |
|    agent/time/fps                    | 4167         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0021361876 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.232       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00642      |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.111 MB of 0.111 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▃▅▆█████████▇▇▇▇▇▇▇▇██▇███████▇▇▇
time/fps,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▂▂▁▁▂▂▃▁▂▆▃█▂▁▂▂▂▂▃▅▂▄▁▃▄▃▄▂▄▄▃▂▆▂▂▂▁▅▄
train/clip_fraction,▅▃▃▂▂▂▃█▆▆▆▄▄▃▃▂▃▂▃▂▄▂▃▁▂▂▁▃▂▃▂▂▂▂▁▁▂▁▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▅▅▆▅▂▂▃▄▅▆▆▆▇▇▇▇▇▇▇▇█▇██▇██▇█████████
train/explained_variance,▁█▇█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,140.13
time/fps,2852.0
train/approx_kl,0.01174
train/clip_fraction,0.09219
train/clip_range,0.1
train/entropy_loss,-0.1274
train/explained_variance,0.99992
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [05:21<21:26, 321.71s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011276192588007284, max=1.0…

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (12000 transitions)
Requested 7200 transitions but only 0 in buffer. Sampling 7200 additional transitions.
Sampling 4800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 28       |
|    agent/rollout/ep_rew_wrapped_mean | 295      |
|    agent/time/fps                    | 4337     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 28       |
|    agent/rollout/ep_rew_wrapped_mean | 295      |
|    agent/time/fps                    | 4.34e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.6         |
|    agent/rollout/ep_rew_wrapped_mean | 247          |
|    agent/time/fps                    | 3182         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0034097936 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.438        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0246       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.4         |
|    agent/rollout/ep_rew_wrapped_mean | 219          |
|    agent/time/fps                    | 4437         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0032691662 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.676        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00368     |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.6         |
|    agent/rollout/ep_rew_wrapped_mean | 198          |
|    agent/time/fps                    | 4493         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0037095144 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.661        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00787      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.8        |
|    agent/rollout/ep_rew_wrapped_mean | 183         |
|    agent/time/fps                    | 4516        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.004262152 |
|    agent/train/clip_fraction         | 0.246       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.682      |
|    agent/train/explained_variance    | 0.466       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00937    |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | 170          |
|    agent/time/fps                    | 4476         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0032090906 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.676       |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0135      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 23.7       |
|    agent/rollout/ep_rew_wrapped_mean | 159        |
|    agent/time/fps                    | 4521       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 14336      |
|    agent/train/approx_kl             | 0.00320241 |
|    agent/train/clip_fraction         | 0.194      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.677     |
|    agent/train/explained_variance    | 0.833      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0253    |
|    agent/train/n_updates             | 60         |
|    agent/train/policy_gradient_loss  | -0.00875 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.2         |
|    agent/rollout/ep_rew_wrapped_mean | 151          |
|    agent/time/fps                    | 4290         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0030220964 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.666       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00327     |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22.9        |
|    agent/rollout/ep_rew_wrapped_mean | 143         |
|    agent/time/fps                    | 4521        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 18432       |
|    agent/train/approx_kl             | 0.002435437 |
|    agent/train/clip_fraction         | 0.125       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.662      |
|    agent/train/explained_variance    | 0.882       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00226     |
|    agent/train/n_updates             | 80          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.1         |
|    agent/rollout/ep_rew_wrapped_mean | 137          |
|    agent/time/fps                    | 4343         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0016820125 |
|    agent/train/clip_fraction         | 0.0847       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.792        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00641      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.3         |
|    agent/rollout/ep_rew_wrapped_mean | 132          |
|    agent/time/fps                    | 4237         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0017858639 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.846        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0121      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 127          |
|    agent/time/fps                    | 4363         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0023977307 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.642       |
|    agent/train/explained_variance    | 0.894        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00969     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.2         |
|    agent/rollout/ep_rew_wrapped_mean | 123          |
|    agent/time/fps                    | 4161         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0030406953 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00977     |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19.8         |
|    agent/rollout/ep_rew_wrapped_mean | 110          |
|    agent/time/fps                    | 4323         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0034237872 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0254      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.4        |
|    agent/rollout/ep_rew_wrapped_mean | 94.4        |
|    agent/time/fps                    | 4464        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.003794412 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.639      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00301     |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 19           |
|    agent/rollout/ep_rew_wrapped_mean | 80.5         |
|    agent/time/fps                    | 4441         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0027181772 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.643       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0198      |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.6        |
|    agent/rollout/ep_rew_wrapped_mean | 70.2        |
|    agent/time/fps                    | 4489        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.004045548 |
|    agent/train/clip_fraction         | 0.192       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.652      |
|    agent/train/explained_variance    | 0.963       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0166     |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.8         |
|    agent/rollout/ep_rew_wrapped_mean | 62.3         |
|    agent/time/fps                    | 4321         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0040571927 |
|    agent/train/clip_fraction         | 0.214        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.656       |
|    agent/train/explained_variance    | 0.894        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0201      |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 20.9        |
|    agent/rollout/ep_rew_wrapped_mean | 61.6        |
|    agent/time/fps                    | 4420        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003572606 |
|    agent/train/clip_fraction         | 0.211       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.649      |
|    agent/train/explained_variance    | 0.951       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00622    |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.7         |
|    agent/rollout/ep_rew_wrapped_mean | 61.4         |
|    agent/time/fps                    | 4459         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0027261546 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.915        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000543    |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.2         |
|    agent/rollout/ep_rew_wrapped_mean | 61.4         |
|    agent/time/fps                    | 4304         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0029774825 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0123       |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25           |
|    agent/rollout/ep_rew_wrapped_mean | 62           |
|    agent/time/fps                    | 4276         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0036525184 |
|    agent/train/clip_fraction         | 0.196        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.643       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00969     |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.8         |
|    agent/rollout/ep_rew_wrapped_mean | 63           |
|    agent/time/fps                    | 4495         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0038442854 |
|    agent/train/clip_fraction         | 0.203        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00357     |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | 64.6         |
|    agent/time/fps                    | 4439         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0036413358 |
|    agent/train/clip_fraction         | 0.191        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.624       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000973     |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.1         |
|    agent/rollout/ep_rew_wrapped_mean | 66.7         |
|    agent/time/fps                    | 4307         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0038334585 |
|    agent/train/clip_fraction         | 0.129        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.63        |
|    agent/train/explained_variance    | 0.944        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0194       |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.5        |
|    agent/rollout/ep_rew_wrapped_mean | 69.9        |
|    agent/time/fps                    | 4472        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.005485625 |
|    agent/train/clip_fraction         | 0.214       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.611      |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0202      |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 47.2        |
|    agent/rollout/ep_rew_wrapped_mean | 73.1        |
|    agent/time/fps                    | 4370        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.004053499 |
|    agent/train/clip_fraction         | 0.216       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.624      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000228   |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 52.1         |
|    agent/rollout/ep_rew_wrapped_mean | 76.9         |
|    agent/time/fps                    | 4248         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0042749755 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.589       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00789      |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 58           |
|    agent/rollout/ep_rew_wrapped_mean | 79.8         |
|    agent/time/fps                    | 4023         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0025775097 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.623       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00266     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 66           |
|    agent/rollout/ep_rew_wrapped_mean | 84.8         |
|    agent/time/fps                    | 4377         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0023196442 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.607       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000342     |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 75.4         |
|    agent/rollout/ep_rew_wrapped_mean | 89.9         |
|    agent/time/fps                    | 4226         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0025004386 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.574       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00921     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 83           |
|    agent/rollout/ep_rew_wrapped_mean | 97.3         |
|    agent/time/fps                    | 4147         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0034603581 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.587       |
|    agent/train/explained_variance    | 0.916        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0295       |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 95.5         |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4139         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0024436447 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.59        |
|    agent/train/explained_variance    | 0.941        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0362       |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 104          |
|    agent/rollout/ep_rew_wrapped_mean | 112          |
|    agent/time/fps                    | 4103         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0059164483 |
|    agent/train/clip_fraction         | 0.199        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.567       |
|    agent/train/explained_variance    | 0.919        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0272       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 115         |
|    agent/rollout/ep_rew_wrapped_mean | 121         |
|    agent/time/fps                    | 4093        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.003443717 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.57       |
|    agent/train/explained_variance    | 0.97        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0461      |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 124          |
|    agent/rollout/ep_rew_wrapped_mean | 132          |
|    agent/time/fps                    | 3918         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0034167864 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.555       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0558       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 141          |
|    agent/rollout/ep_rew_wrapped_mean | 142          |
|    agent/time/fps                    | 4177         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0033588645 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.547       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00402      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 155          |
|    agent/rollout/ep_rew_wrapped_mean | 154          |
|    agent/time/fps                    | 4113         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0026738364 |
|    agent/train/clip_fraction         | 0.137        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.521       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00979      |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 170          |
|    agent/rollout/ep_rew_wrapped_mean | 168          |
|    agent/time/fps                    | 4090         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0037494197 |
|    agent/train/clip_fraction         | 0.139        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.011        |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 183          |
|    agent/rollout/ep_rew_wrapped_mean | 181          |
|    agent/time/fps                    | 4132         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0029481435 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.501       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00688      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 195         |
|    agent/rollout/ep_rew_wrapped_mean | 194         |
|    agent/time/fps                    | 4169        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.003551695 |
|    agent/train/clip_fraction         | 0.123       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.465      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000468   |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 207          |
|    agent/rollout/ep_rew_wrapped_mean | 205          |
|    agent/time/fps                    | 3964         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0023034506 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.491       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0141      |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 221          |
|    agent/rollout/ep_rew_wrapped_mean | 218          |
|    agent/time/fps                    | 2957         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0017805239 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.459       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00274      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 228          |
|    agent/rollout/ep_rew_wrapped_mean | 230          |
|    agent/time/fps                    | 3779         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0026540493 |
|    agent/train/clip_fraction         | 0.129        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.45        |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0004       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 238          |
|    agent/rollout/ep_rew_wrapped_mean | 243          |
|    agent/time/fps                    | 3797         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0036763847 |
|    agent/train/clip_fraction         | 0.123        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.472       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00538     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 248          |
|    agent/rollout/ep_rew_wrapped_mean | 262          |
|    agent/time/fps                    | 4190         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0023421498 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00197      |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 256          |
|    agent/rollout/ep_rew_wrapped_mean | 277          |
|    agent/time/fps                    | 4037         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0046867733 |
|    agent/train/clip_fraction         | 0.108        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.428       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00783      |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 266          |
|    agent/rollout/ep_rew_wrapped_mean | 289          |
|    agent/time/fps                    | 4183         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0029524048 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.46        |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00233     |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 276          |
|    agent/rollout/ep_rew_wrapped_mean | 307          |
|    agent/time/fps                    | 3980         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0027109897 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.435       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0131       |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 288         |
|    agent/rollout/ep_rew_wrapped_mean | 328         |
|    agent/time/fps                    | 4177        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.003608333 |
|    agent/train/clip_fraction         | 0.163       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.413      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00404     |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 297          |
|    agent/rollout/ep_rew_wrapped_mean | 346          |
|    agent/time/fps                    | 4176         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0014518197 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.378       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0156       |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 304          |
|    agent/rollout/ep_rew_wrapped_mean | 362          |
|    agent/time/fps                    | 4261         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0019121352 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.383       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0113       |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 316          |
|    agent/rollout/ep_rew_wrapped_mean | 379          |
|    agent/time/fps                    | 4186         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0024394523 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.377       |
|    agent/train/explained_variance    | 0.901        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0101       |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 328        |
|    agent/rollout/ep_rew_wrapped_mean | 395        |
|    agent/time/fps                    | 4137       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 110592     |
|    agent/train/approx_kl             | 0.00307302 |
|    agent/train/clip_fraction         | 0.101      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.374     |
|    agent/train/explained_variance    | 0.904      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00725    |
|    agent/train/n_updates             | 530        |
|    agent/train/policy_gradient_loss  | 0.00228  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 338         |
|    agent/rollout/ep_rew_wrapped_mean | 410         |
|    agent/time/fps                    | 4047        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.001127575 |
|    agent/train/clip_fraction         | 0.116       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.354      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0118      |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 347          |
|    agent/rollout/ep_rew_wrapped_mean | 428          |
|    agent/time/fps                    | 3944         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0036612444 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0205       |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 358          |
|    agent/rollout/ep_rew_wrapped_mean | 446          |
|    agent/time/fps                    | 4029         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0015718809 |
|    agent/train/clip_fraction         | 0.108        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.349       |
|    agent/train/explained_variance    | 0.942        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00449      |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 365          |
|    agent/rollout/ep_rew_wrapped_mean | 465          |
|    agent/time/fps                    | 3832         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0032928684 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.334       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0283       |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 375          |
|    agent/rollout/ep_rew_wrapped_mean | 486          |
|    agent/time/fps                    | 4070         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0045521893 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.33        |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0021       |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 384          |
|    agent/rollout/ep_rew_wrapped_mean | 508          |
|    agent/time/fps                    | 4011         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0034010238 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.323       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00703      |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 394          |
|    agent/rollout/ep_rew_wrapped_mean | 522          |
|    agent/time/fps                    | 3896         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0026966473 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.325       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.016        |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▃▄▆██████████████████████████████████
time/fps,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▃▄▆▃▄▃▆█▇▃▆▃▄▂▅▂▅▄▂▂▃▃▄▃▂▆▅▄▃▂▆▂▃▂▂▂▃▄▁
train/clip_fraction,▄▅▆█▅▅▄▄▇▄▃▄▃▃▃▃▃▃▃▃▃▂▃▃▂▂▃▂▂▂▂▃▂▃▂▂▂▃▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▂▄▆▇▇█▇▇████████████████████████████▇█
train/explained_variance,▁▅▇▅████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2858.0
train/approx_kl,0.00115
train/clip_fraction,0.05542
train/clip_range,0.1
train/entropy_loss,-0.16661
train/explained_variance,0.99942
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [10:39<15:57, 319.21s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011224066667879621, max=1.0…

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (12000 transitions)
Requested 7200 transitions but only 0 in buffer. Sampling 7200 additional transitions.
Sampling 4800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 43.5     |
|    agent/rollout/ep_rew_wrapped_mean | -52.1    |
|    agent/time/fps                    | 4475     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 43.5     |
|    agent/rollout/ep_rew_wrapped_mean | -52.1    |
|    agent/time/fps                    | 4.48e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.9         |
|    agent/rollout/ep_rew_wrapped_mean | -49.8        |
|    agent/time/fps                    | 4162         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0047592074 |
|    agent/train/clip_fraction         | 0.267        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.0964       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00891      |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.4         |
|    agent/rollout/ep_rew_wrapped_mean | -52          |
|    agent/time/fps                    | 4520         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0017087054 |
|    agent/train/clip_fraction         | 0.0351       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.683       |
|    agent/train/explained_variance    | -0.741       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.037        |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -52.8        |
|    agent/time/fps                    | 4510         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0037705041 |
|    agent/train/clip_fraction         | 0.192        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.777        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00801     |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | -55.1        |
|    agent/time/fps                    | 4466         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0037186325 |
|    agent/train/clip_fraction         | 0.266        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.87         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.017        |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.6         |
|    agent/rollout/ep_rew_wrapped_mean | -55.6        |
|    agent/time/fps                    | 4327         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0040904637 |
|    agent/train/clip_fraction         | 0.253        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00525      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.6         |
|    agent/rollout/ep_rew_wrapped_mean | -59.2        |
|    agent/time/fps                    | 4484         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0036509533 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | -0.552       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0114       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.6         |
|    agent/rollout/ep_rew_wrapped_mean | -66.3        |
|    agent/time/fps                    | 4291         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0018073358 |
|    agent/train/clip_fraction         | 0.0462       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.541        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00624      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26.4         |
|    agent/rollout/ep_rew_wrapped_mean | -70.4        |
|    agent/time/fps                    | 4321         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0019248391 |
|    agent/train/clip_fraction         | 0.0466       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.685       |
|    agent/train/explained_variance    | 0.934        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0127       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 26           |
|    agent/rollout/ep_rew_wrapped_mean | -72.3        |
|    agent/time/fps                    | 4421         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0030894643 |
|    agent/train/clip_fraction         | 0.135        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.684       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0104      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.9         |
|    agent/rollout/ep_rew_wrapped_mean | -71.7        |
|    agent/time/fps                    | 4179         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0014986342 |
|    agent/train/clip_fraction         | 0.0662       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.84         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0043      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.6         |
|    agent/rollout/ep_rew_wrapped_mean | -70.1        |
|    agent/time/fps                    | 4487         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0036319986 |
|    agent/train/clip_fraction         | 0.239        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.029       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 27.4       |
|    agent/rollout/ep_rew_wrapped_mean | -68.5      |
|    agent/time/fps                    | 4418       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 26624      |
|    agent/train/approx_kl             | 0.00349759 |
|    agent/train/clip_fraction         | 0.167      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.672     |
|    agent/train/explained_variance    | 0.579      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00091   |
|    agent/train/n_updates             | 120        |
|    agent/train/policy_gradient_loss  | -0.00414 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -71.2        |
|    agent/time/fps                    | 4122         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0031412002 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.664       |
|    agent/train/explained_variance    | 0.389        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00582      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.5         |
|    agent/rollout/ep_rew_wrapped_mean | -67.4        |
|    agent/time/fps                    | 4368         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0025950028 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.684        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0116      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.2        |
|    agent/rollout/ep_rew_wrapped_mean | -67.5       |
|    agent/time/fps                    | 4464        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.003118452 |
|    agent/train/clip_fraction         | 0.143       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.659      |
|    agent/train/explained_variance    | 0.778       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00793     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.6        |
|    agent/rollout/ep_rew_wrapped_mean | -64.5       |
|    agent/time/fps                    | 4181        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.003004945 |
|    agent/train/clip_fraction         | 0.138       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.654      |
|    agent/train/explained_variance    | 0.867       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0151     |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.2         |
|    agent/rollout/ep_rew_wrapped_mean | -65.3        |
|    agent/time/fps                    | 4446         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0026928762 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.891        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0147       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.4        |
|    agent/rollout/ep_rew_wrapped_mean | -65.8       |
|    agent/time/fps                    | 4238        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.001625824 |
|    agent/train/clip_fraction         | 0.0787      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.647      |
|    agent/train/explained_variance    | 0.899       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0382      |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.7         |
|    agent/rollout/ep_rew_wrapped_mean | -65.4        |
|    agent/time/fps                    | 4173         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0028463323 |
|    agent/train/clip_fraction         | 0.1          |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00136     |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.4         |
|    agent/rollout/ep_rew_wrapped_mean | -65.8        |
|    agent/time/fps                    | 4337         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0027739294 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.656       |
|    agent/train/explained_variance    | 0.881        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.123        |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.8         |
|    agent/rollout/ep_rew_wrapped_mean | -65.7        |
|    agent/time/fps                    | 4447         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0029522812 |
|    agent/train/clip_fraction         | 0.0986       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0821       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 39.3        |
|    agent/rollout/ep_rew_wrapped_mean | -65.2       |
|    agent/time/fps                    | 4382        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.003140887 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.653      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0733      |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 40.6        |
|    agent/rollout/ep_rew_wrapped_mean | -62.3       |
|    agent/time/fps                    | 4038        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.003149239 |
|    agent/train/clip_fraction         | 0.175       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.657      |
|    agent/train/explained_variance    | 0.969       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0318      |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41           |
|    agent/rollout/ep_rew_wrapped_mean | -57.5        |
|    agent/time/fps                    | 4308         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0031740256 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.642       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0271       |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 42.5         |
|    agent/rollout/ep_rew_wrapped_mean | -53.8        |
|    agent/time/fps                    | 4441         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0034168721 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.64        |
|    agent/train/explained_variance    | 0.921        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.136        |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 45.1         |
|    agent/rollout/ep_rew_wrapped_mean | -50.6        |
|    agent/time/fps                    | 4170         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0032818092 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.653       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0376       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 46.3        |
|    agent/rollout/ep_rew_wrapped_mean | -49.2       |
|    agent/time/fps                    | 3880        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.003844753 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.638      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0533      |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 47.3        |
|    agent/rollout/ep_rew_wrapped_mean | -48.8       |
|    agent/time/fps                    | 4207        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 59392       |
|    agent/train/approx_kl             | 0.003780378 |
|    agent/train/clip_fraction         | 0.247       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.632      |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.015      |
|    agent/train/n_updates             | 280         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.5         |
|    agent/rollout/ep_rew_wrapped_mean | -47.4        |
|    agent/time/fps                    | 4187         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0050611766 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.104        |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 49.7         |
|    agent/rollout/ep_rew_wrapped_mean | -46.8        |
|    agent/time/fps                    | 4067         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0040274477 |
|    agent/train/clip_fraction         | 0.27         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.626       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0106      |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 52.2        |
|    agent/rollout/ep_rew_wrapped_mean | -46.2       |
|    agent/time/fps                    | 4260        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.005235891 |
|    agent/train/clip_fraction         | 0.204       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.569      |
|    agent/train/explained_variance    | 0.945       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.13        |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 55.9        |
|    agent/rollout/ep_rew_wrapped_mean | -44.9       |
|    agent/time/fps                    | 3580        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.004379912 |
|    agent/train/clip_fraction         | 0.18        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.599      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0286      |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 59.3        |
|    agent/rollout/ep_rew_wrapped_mean | -43.7       |
|    agent/time/fps                    | 4427        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.003167153 |
|    agent/train/clip_fraction         | 0.154       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.59       |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0596      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 62.7         |
|    agent/rollout/ep_rew_wrapped_mean | -41          |
|    agent/time/fps                    | 2588         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0019505762 |
|    agent/train/clip_fraction         | 0.13         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.596       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0419       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 66.2         |
|    agent/rollout/ep_rew_wrapped_mean | -39.1        |
|    agent/time/fps                    | 4051         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0052232617 |
|    agent/train/clip_fraction         | 0.182        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.594       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0358       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 69.7         |
|    agent/rollout/ep_rew_wrapped_mean | -35          |
|    agent/time/fps                    | 4065         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0039774985 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.586       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.173        |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 74.1         |
|    agent/rollout/ep_rew_wrapped_mean | -32          |
|    agent/time/fps                    | 4038         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0038168444 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.564       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.116        |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 77.9         |
|    agent/rollout/ep_rew_wrapped_mean | -27.7        |
|    agent/time/fps                    | 3845         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0032549975 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.073        |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 81.6         |
|    agent/rollout/ep_rew_wrapped_mean | -23.8        |
|    agent/time/fps                    | 4229         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0041273534 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.573       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.146        |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 87.6        |
|    agent/rollout/ep_rew_wrapped_mean | -19.8       |
|    agent/time/fps                    | 4228        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.003562223 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.583      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0397      |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 92.2         |
|    agent/rollout/ep_rew_wrapped_mean | -14.7        |
|    agent/time/fps                    | 4082         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0045280145 |
|    agent/train/clip_fraction         | 0.196        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.552       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.124        |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 98.4        |
|    agent/rollout/ep_rew_wrapped_mean | -8.99       |
|    agent/time/fps                    | 4002        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.006448485 |
|    agent/train/clip_fraction         | 0.197       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.536      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.168       |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 104          |
|    agent/rollout/ep_rew_wrapped_mean | -3.33        |
|    agent/time/fps                    | 4228         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0033994145 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.549       |
|    agent/train/explained_variance    | 0.969        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.339        |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 111          |
|    agent/rollout/ep_rew_wrapped_mean | 3.55         |
|    agent/time/fps                    | 4215         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0043193228 |
|    agent/train/clip_fraction         | 0.196        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.561       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.112        |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 116         |
|    agent/rollout/ep_rew_wrapped_mean | 10.2        |
|    agent/time/fps                    | 4237        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.004824093 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.53       |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.221       |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 122          |
|    agent/rollout/ep_rew_wrapped_mean | 14.4         |
|    agent/time/fps                    | 4322         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0031031538 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.571       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.134        |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 131          |
|    agent/rollout/ep_rew_wrapped_mean | 21.4         |
|    agent/time/fps                    | 4230         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0045919684 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0298       |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 138          |
|    agent/rollout/ep_rew_wrapped_mean | 32.9         |
|    agent/time/fps                    | 4169         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0024258238 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.509       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.064        |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 146          |
|    agent/rollout/ep_rew_wrapped_mean | 38.8         |
|    agent/time/fps                    | 4114         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0034980595 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.535       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0916       |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 153          |
|    agent/rollout/ep_rew_wrapped_mean | 49.8         |
|    agent/time/fps                    | 4044         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0029771847 |
|    agent/train/clip_fraction         | 0.144        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.993        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.261        |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 163          |
|    agent/rollout/ep_rew_wrapped_mean | 59           |
|    agent/time/fps                    | 4033         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0034232803 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.518       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0474       |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 174          |
|    agent/rollout/ep_rew_wrapped_mean | 74.3         |
|    agent/time/fps                    | 4155         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0056746555 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0464       |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 183          |
|    agent/rollout/ep_rew_wrapped_mean | 85           |
|    agent/time/fps                    | 4152         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0036588917 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.486       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0578       |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 191          |
|    agent/rollout/ep_rew_wrapped_mean | 96.5         |
|    agent/time/fps                    | 3618         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0070935246 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.502       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0446       |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 200          |
|    agent/rollout/ep_rew_wrapped_mean | 106          |
|    agent/time/fps                    | 3929         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0034126528 |
|    agent/train/clip_fraction         | 0.197        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0976       |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 214          |
|    agent/rollout/ep_rew_wrapped_mean | 118          |
|    agent/time/fps                    | 4063         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0034909404 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0129       |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 224          |
|    agent/rollout/ep_rew_wrapped_mean | 134          |
|    agent/time/fps                    | 4083         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0028757672 |
|    agent/train/clip_fraction         | 0.172        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.477       |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.111        |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 233          |
|    agent/rollout/ep_rew_wrapped_mean | 147          |
|    agent/time/fps                    | 4010         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0036772804 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.482       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.067        |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 245          |
|    agent/rollout/ep_rew_wrapped_mean | 162          |
|    agent/time/fps                    | 4007         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0046630064 |
|    agent/train/clip_fraction         | 0.236        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.481       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0421       |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 255         |
|    agent/rollout/ep_rew_wrapped_mean | 174         |
|    agent/time/fps                    | 4153        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.005355711 |
|    agent/train/clip_fraction         | 0.207       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.47       |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0227      |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.159 MB of 0.159 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▂▃▄▆▇█████████████████████████████
time/fps,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▃▃▆▆▃▅▇▄█▅▂▆▁▄▁▆▃▃▂▁▂▄▂▃▂▂▆▇▁▅▂▃▁▂▂▄▃▄▄▂
train/clip_fraction,▇▄█▆▄▅▆▅▅▃▃▄▃▃▂▂▃▁▂▂▂▃▂▂▂▂▂▃▁▂▁▂▂▂▁▂▁▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▃▄▄▃▃▄▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇█▇██▇▇▇████
train/explained_variance,▁▇▇▇████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2722.0
train/approx_kl,0.00267
train/clip_fraction,0.08398
train/clip_range,0.1
train/entropy_loss,-0.16206
train/explained_variance,0.9997
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [16:05<10:45, 322.68s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011171586110463573, max=1.0…

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (12000 transitions)
Requested 7200 transitions but only 0 in buffer. Sampling 7200 additional transitions.
Sampling 4800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 28.8     |
|    agent/rollout/ep_rew_wrapped_mean | 344      |
|    agent/time/fps                    | 4374     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 28.8     |
|    agent/rollout/ep_rew_wrapped_mean | 344      |
|    agent/time/fps                    | 4.37e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | 266          |
|    agent/time/fps                    | 4332         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0038347011 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.69        |
|    agent/train/explained_variance    | -0.0979      |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00655     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31           |
|    agent/rollout/ep_rew_wrapped_mean | 222          |
|    agent/time/fps                    | 4513         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0029709148 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | 0.577        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.012       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.2        |
|    agent/rollout/ep_rew_wrapped_mean | 191         |
|    agent/time/fps                    | 4433        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.003031287 |
|    agent/train/clip_fraction         | 0.137       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.686      |
|    agent/train/explained_variance    | 0.899       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00786    |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33          |
|    agent/rollout/ep_rew_wrapped_mean | 168         |
|    agent/time/fps                    | 4477        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.004006882 |
|    agent/train/clip_fraction         | 0.299       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.682      |
|    agent/train/explained_variance    | 0.941       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0353     |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.7         |
|    agent/rollout/ep_rew_wrapped_mean | 151          |
|    agent/time/fps                    | 3895         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0030517145 |
|    agent/train/clip_fraction         | 0.193        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.727        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0173      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.1         |
|    agent/rollout/ep_rew_wrapped_mean | 138          |
|    agent/time/fps                    | 3272         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0035410784 |
|    agent/train/clip_fraction         | 0.187        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.662       |
|    agent/train/explained_variance    | 0.829        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0179      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.2        |
|    agent/rollout/ep_rew_wrapped_mean | 127         |
|    agent/time/fps                    | 4230        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.002390188 |
|    agent/train/clip_fraction         | 0.0995      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.667      |
|    agent/train/explained_variance    | 0.878       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.019      |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.4         |
|    agent/rollout/ep_rew_wrapped_mean | 117          |
|    agent/time/fps                    | 3892         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0022314086 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.66        |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00712     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.2         |
|    agent/rollout/ep_rew_wrapped_mean | 109          |
|    agent/time/fps                    | 3603         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0030725475 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.67        |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0157      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34.8         |
|    agent/rollout/ep_rew_wrapped_mean | 102          |
|    agent/time/fps                    | 3865         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0034013903 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00678     |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 35.4        |
|    agent/rollout/ep_rew_wrapped_mean | 96          |
|    agent/time/fps                    | 3967        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.003035172 |
|    agent/train/clip_fraction         | 0.142       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.65       |
|    agent/train/explained_variance    | 0.927       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0258     |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 35.7       |
|    agent/rollout/ep_rew_wrapped_mean | 90         |
|    agent/time/fps                    | 3930       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 26624      |
|    agent/train/approx_kl             | 0.00297566 |
|    agent/train/clip_fraction         | 0.152      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.654     |
|    agent/train/explained_variance    | 0.943      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00704   |
|    agent/train/n_updates             | 120        |
|    agent/train/policy_gradient_loss  | -0.00485 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.7         |
|    agent/rollout/ep_rew_wrapped_mean | 73.2         |
|    agent/time/fps                    | 3117         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0010132056 |
|    agent/train/clip_fraction         | 0.0754       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.654       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0133      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.1         |
|    agent/rollout/ep_rew_wrapped_mean | 56.9         |
|    agent/time/fps                    | 3172         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0024145334 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00684     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.6         |
|    agent/rollout/ep_rew_wrapped_mean | 37.9         |
|    agent/time/fps                    | 3155         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0033742357 |
|    agent/train/clip_fraction         | 0.247        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.65        |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00581     |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.3         |
|    agent/rollout/ep_rew_wrapped_mean | 10.4         |
|    agent/time/fps                    | 4076         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0020149788 |
|    agent/train/clip_fraction         | 0.0761       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.92         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00285     |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.2        |
|    agent/rollout/ep_rew_wrapped_mean | -0.633      |
|    agent/time/fps                    | 4229        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.003137111 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.641      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00381     |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37.9        |
|    agent/rollout/ep_rew_wrapped_mean | -1.32       |
|    agent/time/fps                    | 3978        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003911157 |
|    agent/train/clip_fraction         | 0.284       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.649      |
|    agent/train/explained_variance    | 0.983       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00355    |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.2         |
|    agent/rollout/ep_rew_wrapped_mean | -2.26        |
|    agent/time/fps                    | 3356         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0039774133 |
|    agent/train/clip_fraction         | 0.232        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0115      |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.4        |
|    agent/rollout/ep_rew_wrapped_mean | -4.77       |
|    agent/time/fps                    | 3459        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.004016663 |
|    agent/train/clip_fraction         | 0.201       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.642      |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00328     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.3        |
|    agent/rollout/ep_rew_wrapped_mean | -8.63       |
|    agent/time/fps                    | 4223        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.004189791 |
|    agent/train/clip_fraction         | 0.169       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.642      |
|    agent/train/explained_variance    | 0.948       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0153      |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.5        |
|    agent/rollout/ep_rew_wrapped_mean | -12.2       |
|    agent/time/fps                    | 4285        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.004086206 |
|    agent/train/clip_fraction         | 0.236       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.623      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0138     |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.7         |
|    agent/rollout/ep_rew_wrapped_mean | -15.2        |
|    agent/time/fps                    | 3988         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0025905173 |
|    agent/train/clip_fraction         | 0.199        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.622       |
|    agent/train/explained_variance    | 0.94         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.005        |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 39.6        |
|    agent/rollout/ep_rew_wrapped_mean | -19.2       |
|    agent/time/fps                    | 3859        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.002723199 |
|    agent/train/clip_fraction         | 0.152       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.616      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00897     |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.3         |
|    agent/rollout/ep_rew_wrapped_mean | -21.6        |
|    agent/time/fps                    | 4136         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0034279488 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.615       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00381     |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 40.9        |
|    agent/rollout/ep_rew_wrapped_mean | -23.5       |
|    agent/time/fps                    | 3183        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.003704803 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.595      |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.022       |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41.4         |
|    agent/rollout/ep_rew_wrapped_mean | -26.6        |
|    agent/time/fps                    | 4334         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0029328524 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00342     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 41.5        |
|    agent/rollout/ep_rew_wrapped_mean | -29.1       |
|    agent/time/fps                    | 3864        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 59392       |
|    agent/train/approx_kl             | 0.004167754 |
|    agent/train/clip_fraction         | 0.196       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.586      |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00558     |
|    agent/train/n_updates             | 280         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 42.1         |
|    agent/rollout/ep_rew_wrapped_mean | -31.2        |
|    agent/time/fps                    | 4042         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0028962216 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.573       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0157      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 42.5         |
|    agent/rollout/ep_rew_wrapped_mean | -32.6        |
|    agent/time/fps                    | 3650         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0024206457 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00702      |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 43           |
|    agent/rollout/ep_rew_wrapped_mean | -34.2        |
|    agent/time/fps                    | 4119         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0034153322 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.568       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00401      |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 43.6        |
|    agent/rollout/ep_rew_wrapped_mean | -34.6       |
|    agent/time/fps                    | 3344        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.004879079 |
|    agent/train/clip_fraction         | 0.188       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.569      |
|    agent/train/explained_variance    | 0.925       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0165      |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 44.3        |
|    agent/rollout/ep_rew_wrapped_mean | -35.9       |
|    agent/time/fps                    | 3553        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.007575739 |
|    agent/train/clip_fraction         | 0.254       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.552      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0431      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 45.6         |
|    agent/rollout/ep_rew_wrapped_mean | -38.7        |
|    agent/time/fps                    | 3855         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0037410269 |
|    agent/train/clip_fraction         | 0.182        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.551       |
|    agent/train/explained_variance    | 0.982        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00856      |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 46.4         |
|    agent/rollout/ep_rew_wrapped_mean | -39.5        |
|    agent/time/fps                    | 3923         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0030952157 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.543       |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0503       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 46.4         |
|    agent/rollout/ep_rew_wrapped_mean | -40.6        |
|    agent/time/fps                    | 4060         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0035844843 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.534       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0173       |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.2         |
|    agent/rollout/ep_rew_wrapped_mean | -41.9        |
|    agent/time/fps                    | 3736         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0033470131 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.532       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0218       |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 47.1        |
|    agent/rollout/ep_rew_wrapped_mean | -42         |
|    agent/time/fps                    | 3702        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.002886441 |
|    agent/train/clip_fraction         | 0.146       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.554      |
|    agent/train/explained_variance    | 0.919       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0497      |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47           |
|    agent/rollout/ep_rew_wrapped_mean | -42.6        |
|    agent/time/fps                    | 4294         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0038640937 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.512       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0362       |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.8         |
|    agent/rollout/ep_rew_wrapped_mean | -44.1        |
|    agent/time/fps                    | 4130         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0058647143 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.506       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.013        |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.8         |
|    agent/rollout/ep_rew_wrapped_mean | -45.2        |
|    agent/time/fps                    | 3215         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0062113106 |
|    agent/train/clip_fraction         | 0.24         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.525       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0334       |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 49.5         |
|    agent/rollout/ep_rew_wrapped_mean | -46.6        |
|    agent/time/fps                    | 3785         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0036550933 |
|    agent/train/clip_fraction         | 0.235        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0554       |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 50.1        |
|    agent/rollout/ep_rew_wrapped_mean | -47.1       |
|    agent/time/fps                    | 3660        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.002561654 |
|    agent/train/clip_fraction         | 0.13        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.525      |
|    agent/train/explained_variance    | 0.955       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0546      |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 51.3         |
|    agent/rollout/ep_rew_wrapped_mean | -49.3        |
|    agent/time/fps                    | 4197         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0045723952 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0534       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 51.8         |
|    agent/rollout/ep_rew_wrapped_mean | -49.9        |
|    agent/time/fps                    | 3822         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0037321942 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.521       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.027        |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 52.8        |
|    agent/rollout/ep_rew_wrapped_mean | -52.2       |
|    agent/time/fps                    | 4202        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.005085963 |
|    agent/train/clip_fraction         | 0.172       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.526      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0589      |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 53.8         |
|    agent/rollout/ep_rew_wrapped_mean | -54.1        |
|    agent/time/fps                    | 4041         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0026638736 |
|    agent/train/clip_fraction         | 0.154        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.536       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.103        |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 55.5         |
|    agent/rollout/ep_rew_wrapped_mean | -56.5        |
|    agent/time/fps                    | 3051         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0041466095 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.513       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.112        |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 55.5         |
|    agent/rollout/ep_rew_wrapped_mean | -59.6        |
|    agent/time/fps                    | 3771         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0034578606 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.534       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.152        |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 57.1         |
|    agent/rollout/ep_rew_wrapped_mean | -61.4        |
|    agent/time/fps                    | 3858         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0042078393 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.525       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.157        |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 58           |
|    agent/rollout/ep_rew_wrapped_mean | -63          |
|    agent/time/fps                    | 3718         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0032151416 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.52        |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.635        |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 59.6        |
|    agent/rollout/ep_rew_wrapped_mean | -63.8       |
|    agent/time/fps                    | 3634        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 108544      |
|    agent/train/approx_kl             | 0.006430491 |
|    agent/train/clip_fraction         | 0.22        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.501      |
|    agent/train/explained_variance    | 0.923       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.153       |
|    agent/train/n_updates             | 520         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 60.6        |
|    agent/rollout/ep_rew_wrapped_mean | -64.5       |
|    agent/time/fps                    | 3286        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.003774513 |
|    agent/train/clip_fraction         | 0.143       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.5        |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0644      |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 60.8        |
|    agent/rollout/ep_rew_wrapped_mean | -67.3       |
|    agent/time/fps                    | 4375        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.003550642 |
|    agent/train/clip_fraction         | 0.143       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.477      |
|    agent/train/explained_variance    | 0.926       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0904      |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -70.4       |
|    agent/time/fps                    | 3808        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.003417941 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.484      |
|    agent/train/explained_variance    | 0.97        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0679      |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 61.9        |
|    agent/rollout/ep_rew_wrapped_mean | -74.7       |
|    agent/time/fps                    | 4266        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.002785585 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.487      |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.332       |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 62.3         |
|    agent/rollout/ep_rew_wrapped_mean | -76          |
|    agent/time/fps                    | 3897         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0033883087 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.466       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0709       |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 63.4        |
|    agent/rollout/ep_rew_wrapped_mean | -76.1       |
|    agent/time/fps                    | 3173        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.003692343 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.456      |
|    agent/train/explained_variance    | 0.939       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0629      |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 63.9        |
|    agent/rollout/ep_rew_wrapped_mean | -77.5       |
|    agent/time/fps                    | 4005        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.003595495 |
|    agent/train/clip_fraction         | 0.178       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.532      |
|    agent/train/explained_variance    | 0.957       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.105       |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 64.8         |
|    agent/rollout/ep_rew_wrapped_mean | -79.1        |
|    agent/time/fps                    | 3928         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0049081426 |
|    agent/train/clip_fraction         | 0.198        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0683       |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▂▂▃▄▅▇████████████████████████████████
time/fps,█▂▂▂▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▁▃▂▄▄▅▂▂█▂▅▁▃▁▁▁▃▂▂▁▂▂▃▁▁▁▂▂▃▃▂▂▃▂▂▁▄▃▁▁
train/clip_fraction,▃▇▄▇▇█▄▆▃▅▃▄▃▃▄▃▄▄▄▃▃▃▃▂▂▂▄▃▃▂▃▃▂▂▁▁▃▃▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▃▃▄▅▆▆▆▆▆▇▆▆▇▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇██▇███
train/explained_variance,▁▇██████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2866.0
train/approx_kl,0.00159
train/clip_fraction,0.04639
train/clip_range,0.1
train/entropy_loss,-0.1004
train/explained_variance,0.99767
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [21:49<05:31, 331.01s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011231081943131155, max=1.0…

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (12000 transitions)
Requested 7200 transitions but only 0 in buffer. Sampling 7200 additional transitions.
Sampling 4800 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | -17.8    |
|    agent/time/fps                    | 4430     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 33.2     |
|    agent/rollout/ep_rew_wrapped_mean | -17.8    |
|    agent/time/fps                    | 4.43e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27          |
|    agent/rollout/ep_rew_wrapped_mean | -24.4       |
|    agent/time/fps                    | 4461        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.003979719 |
|    agent/train/clip_fraction         | 0.142       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.69       |
|    agent/train/explained_variance    | 0.236       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00323     |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.2         |
|    agent/rollout/ep_rew_wrapped_mean | -33          |
|    agent/time/fps                    | 4381         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0008432453 |
|    agent/train/clip_fraction         | 0.00981      |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | -0.717       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0212       |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.6         |
|    agent/rollout/ep_rew_wrapped_mean | -39.4        |
|    agent/time/fps                    | 3614         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0019575744 |
|    agent/train/clip_fraction         | 0.0462       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.431        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0262       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.2         |
|    agent/rollout/ep_rew_wrapped_mean | -45.6        |
|    agent/time/fps                    | 4237         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0037825634 |
|    agent/train/clip_fraction         | 0.2          |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0125       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.3         |
|    agent/rollout/ep_rew_wrapped_mean | -46.8        |
|    agent/time/fps                    | 3985         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0030711633 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.858        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00774      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.1         |
|    agent/rollout/ep_rew_wrapped_mean | -47.4        |
|    agent/time/fps                    | 4439         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0030269923 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00713     |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.8         |
|    agent/rollout/ep_rew_wrapped_mean | -46.5        |
|    agent/time/fps                    | 4388         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0010525031 |
|    agent/train/clip_fraction         | 0.0303       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.549        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00786      |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.9         |
|    agent/rollout/ep_rew_wrapped_mean | -45.7        |
|    agent/time/fps                    | 4334         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0027409692 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.887        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0133       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 25.6        |
|    agent/rollout/ep_rew_wrapped_mean | -44.8       |
|    agent/time/fps                    | 3222        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.003086951 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.682      |
|    agent/train/explained_variance    | 0.93        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00766     |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.8         |
|    agent/rollout/ep_rew_wrapped_mean | -43.7        |
|    agent/time/fps                    | 4285         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0030990378 |
|    agent/train/clip_fraction         | 0.217        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00618     |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | -41.9        |
|    agent/time/fps                    | 4062         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0042999475 |
|    agent/train/clip_fraction         | 0.283        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.671       |
|    agent/train/explained_variance    | 0.909        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00137     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | -40.8        |
|    agent/time/fps                    | 3270         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0036476513 |
|    agent/train/clip_fraction         | 0.21         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.898        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0115      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.6         |
|    agent/rollout/ep_rew_wrapped_mean | -43.2        |
|    agent/time/fps                    | 3414         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0039406535 |
|    agent/train/clip_fraction         | 0.178        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.655       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00626      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39           |
|    agent/rollout/ep_rew_wrapped_mean | -41          |
|    agent/time/fps                    | 3579         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0036468164 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.648       |
|    agent/train/explained_variance    | 0.949        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00953      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.8         |
|    agent/rollout/ep_rew_wrapped_mean | -38.4        |
|    agent/time/fps                    | 3350         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0034040213 |
|    agent/train/clip_fraction         | 0.178        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0117       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 44.2         |
|    agent/rollout/ep_rew_wrapped_mean | -40.2        |
|    agent/time/fps                    | 3556         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0036883042 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.632       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0299       |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 51.5         |
|    agent/rollout/ep_rew_wrapped_mean | -39.4        |
|    agent/time/fps                    | 3054         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0031708009 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.625       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000633     |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 57.7        |
|    agent/rollout/ep_rew_wrapped_mean | -36.1       |
|    agent/time/fps                    | 3420        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.004649045 |
|    agent/train/clip_fraction         | 0.2         |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.606      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0108     |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 66.4         |
|    agent/rollout/ep_rew_wrapped_mean | -31.1        |
|    agent/time/fps                    | 3381         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0048182746 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.574       |
|    agent/train/explained_variance    | 0.933        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0128       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 75.1         |
|    agent/rollout/ep_rew_wrapped_mean | -25.7        |
|    agent/time/fps                    | 3320         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0035950155 |
|    agent/train/clip_fraction         | 0.21         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.581       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0246      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 83.3         |
|    agent/rollout/ep_rew_wrapped_mean | -20.3        |
|    agent/time/fps                    | 3696         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0037000633 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.576       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00687      |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 89.3         |
|    agent/rollout/ep_rew_wrapped_mean | -15.7        |
|    agent/time/fps                    | 3670         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0039012795 |
|    agent/train/clip_fraction         | 0.24         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.579       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00904      |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 93.6         |
|    agent/rollout/ep_rew_wrapped_mean | -10.2        |
|    agent/time/fps                    | 3446         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0027588047 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.568       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00139      |
|    agent/train/n_updates             | 230          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 99.9         |
|    agent/rollout/ep_rew_wrapped_mean | -6           |
|    agent/time/fps                    | 3219         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0032998323 |
|    agent/train/clip_fraction         | 0.184        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.548       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0238      |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 108          |
|    agent/rollout/ep_rew_wrapped_mean | -1.62        |
|    agent/time/fps                    | 3715         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0031123527 |
|    agent/train/clip_fraction         | 0.167        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.544       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00426     |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 115          |
|    agent/rollout/ep_rew_wrapped_mean | 2.56         |
|    agent/time/fps                    | 3708         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0050635682 |
|    agent/train/clip_fraction         | 0.205        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.527       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0107      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 122         |
|    agent/rollout/ep_rew_wrapped_mean | 7.24        |
|    agent/time/fps                    | 3078        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.001966838 |
|    agent/train/clip_fraction         | 0.122       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.52       |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0036     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 128          |
|    agent/rollout/ep_rew_wrapped_mean | 11.4         |
|    agent/time/fps                    | 4248         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0026684143 |
|    agent/train/clip_fraction         | 0.163        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.512       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000263     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 134          |
|    agent/rollout/ep_rew_wrapped_mean | 15.3         |
|    agent/time/fps                    | 2985         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0027323184 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.492       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0103      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 140         |
|    agent/rollout/ep_rew_wrapped_mean | 18.7        |
|    agent/time/fps                    | 3268        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.001555285 |
|    agent/train/clip_fraction         | 0.129       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.499      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0117     |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 146          |
|    agent/rollout/ep_rew_wrapped_mean | 22.1         |
|    agent/time/fps                    | 3534         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0021338346 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00082     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 152          |
|    agent/rollout/ep_rew_wrapped_mean | 24.5         |
|    agent/time/fps                    | 3911         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0023535462 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.488       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0031      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 158          |
|    agent/rollout/ep_rew_wrapped_mean | 27.5         |
|    agent/time/fps                    | 3554         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0024430384 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.489       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00137     |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 164          |
|    agent/rollout/ep_rew_wrapped_mean | 30.2         |
|    agent/time/fps                    | 1633         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0021804883 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.482       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00476     |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 167          |
|    agent/rollout/ep_rew_wrapped_mean | 33.1         |
|    agent/time/fps                    | 1519         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0035285456 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.49        |
|    agent/train/explained_variance    | 0.985        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0148       |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 172          |
|    agent/rollout/ep_rew_wrapped_mean | 35.8         |
|    agent/time/fps                    | 2803         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0012082792 |
|    agent/train/clip_fraction         | 0.0959       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.478       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00677     |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 178          |
|    agent/rollout/ep_rew_wrapped_mean | 42.6         |
|    agent/time/fps                    | 3500         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0021041234 |
|    agent/train/clip_fraction         | 0.0955       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.493       |
|    agent/train/explained_variance    | 0.934        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0429       |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 183          |
|    agent/rollout/ep_rew_wrapped_mean | 55.4         |
|    agent/time/fps                    | 3251         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0014866202 |
|    agent/train/clip_fraction         | 0.0821       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.497       |
|    agent/train/explained_variance    | 0.879        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0586       |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 185          |
|    agent/rollout/ep_rew_wrapped_mean | 63.6         |
|    agent/time/fps                    | 3745         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0016955012 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.482       |
|    agent/train/explained_variance    | 0.921        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00557      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 190          |
|    agent/rollout/ep_rew_wrapped_mean | 75.9         |
|    agent/time/fps                    | 4386         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0018950899 |
|    agent/train/clip_fraction         | 0.0928       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.493       |
|    agent/train/explained_variance    | 0.932        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00612      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 193         |
|    agent/rollout/ep_rew_wrapped_mean | 87.9        |
|    agent/time/fps                    | 4127        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.001546148 |
|    agent/train/clip_fraction         | 0.0875      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.492      |
|    agent/train/explained_variance    | 0.907       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0563      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 194         |
|    agent/rollout/ep_rew_wrapped_mean | 96.4        |
|    agent/time/fps                    | 3940        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.002468525 |
|    agent/train/clip_fraction         | 0.0903      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.49       |
|    agent/train/explained_variance    | 0.929       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0184      |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 194          |
|    agent/rollout/ep_rew_wrapped_mean | 103          |
|    agent/time/fps                    | 4019         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0015380571 |
|    agent/train/clip_fraction         | 0.0796       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.471       |
|    agent/train/explained_variance    | 0.957        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.19         |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 192          |
|    agent/rollout/ep_rew_wrapped_mean | 110          |
|    agent/time/fps                    | 4333         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0039804955 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.486       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0432       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 189          |
|    agent/rollout/ep_rew_wrapped_mean | 117          |
|    agent/time/fps                    | 3996         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0036449218 |
|    agent/train/clip_fraction         | 0.116        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.494       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.466        |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 188          |
|    agent/rollout/ep_rew_wrapped_mean | 125          |
|    agent/time/fps                    | 4020         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0038753941 |
|    agent/train/clip_fraction         | 0.174        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.524       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0295       |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 186          |
|    agent/rollout/ep_rew_wrapped_mean | 138          |
|    agent/time/fps                    | 3950         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0039963205 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.521       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0305       |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 187          |
|    agent/rollout/ep_rew_wrapped_mean | 147          |
|    agent/time/fps                    | 3943         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0042963456 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.537       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0579       |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 193          |
|    agent/rollout/ep_rew_wrapped_mean | 156          |
|    agent/time/fps                    | 3909         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0036697683 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.536       |
|    agent/train/explained_variance    | 0.993        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0482       |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 197          |
|    agent/rollout/ep_rew_wrapped_mean | 169          |
|    agent/time/fps                    | 3884         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0023391165 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.529       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.104        |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 209          |
|    agent/rollout/ep_rew_wrapped_mean | 184          |
|    agent/time/fps                    | 2736         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0047924956 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.538       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0306       |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 221          |
|    agent/rollout/ep_rew_wrapped_mean | 200          |
|    agent/time/fps                    | 3278         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0055760737 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.52        |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.068        |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 234          |
|    agent/rollout/ep_rew_wrapped_mean | 213          |
|    agent/time/fps                    | 3585         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0051489156 |
|    agent/train/clip_fraction         | 0.176        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.519       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00989      |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 247          |
|    agent/rollout/ep_rew_wrapped_mean | 222          |
|    agent/time/fps                    | 3491         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0035846885 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00681     |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 261         |
|    agent/rollout/ep_rew_wrapped_mean | 229         |
|    agent/time/fps                    | 3693        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.003776436 |
|    agent/train/clip_fraction         | 0.129       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.515      |
|    agent/train/explained_variance    | 0.998       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00722     |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 273          |
|    agent/rollout/ep_rew_wrapped_mean | 238          |
|    agent/time/fps                    | 3326         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0059362836 |
|    agent/train/clip_fraction         | 0.157        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.517       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0037      |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 287          |
|    agent/rollout/ep_rew_wrapped_mean | 246          |
|    agent/time/fps                    | 3864         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0062798373 |
|    agent/train/clip_fraction         | 0.224        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.5         |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000332     |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 299          |
|    agent/rollout/ep_rew_wrapped_mean | 255          |
|    agent/time/fps                    | 3523         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0036034924 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.49        |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0214       |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 312          |
|    agent/rollout/ep_rew_wrapped_mean | 269          |
|    agent/time/fps                    | 3620         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0025313506 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.494       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0626       |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 326          |
|    agent/rollout/ep_rew_wrapped_mean | 286          |
|    agent/time/fps                    | 3361         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0016583232 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.504       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0152       |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.111 MB of 0.111 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▂▃▅▇█████████████████████████████████
time/fps,▁▄▆▆▇▇▆▇▇▇▇▇▇███████████████████████████
train/approx_kl,▂▂▃▃▃▃▂▁▂▁▁▃▁▂▂▃▂▃▃▂▁▁▃▇▂▂▂█▃▄▁▂▂▂▂▂▃▂▃▂
train/clip_fraction,▄▆█▇▇▆▅▅▄▅▄▄▃▄▆▃▃▅▄▃▂▂▅▃▃▃▂▄▂▃▂▂▁▂▁▁▂▁▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▂▄▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████▇█▇███████▇█
train/explained_variance,▁▇██▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2849.0
train/approx_kl,0.00305
train/clip_fraction,0.06929
train/clip_range,0.1
train/entropy_loss,-0.15345
train/explained_variance,0.93539
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [27:40<00:00, 332.15s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s]

Query schedule: [50, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5]
Collecting 100 fragments (20000 transitions)
Requested 12000 transitions but only 0 in buffer. Sampling 12000 additional transitions.
Sampling 8000 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 29.5     |
|    agent/rollout/ep_rew_wrapped_mean | 80       |
|    agent/time/fps                    | 4163     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 29.5     |
|    agent/rollout/ep_rew_wrapped_mean | 80       |
|    agent/time/fps                    | 4.16e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.6         |
|    agent/rollout/ep_rew_wrapped_mean | 59.9         |
|    agent/time/fps                    | 4208         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0027735783 |
|    agent/train/clip_fraction         | 0.126        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.0838       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000484     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | 46.3         |
|    agent/time/fps                    | 4481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0030113123 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.684       |
|    agent/train/explained_variance    | 0.54         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0164      |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | 36.9         |
|    agent/time/fps                    | 3914         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0028978407 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.678       |
|    agent/train/explained_variance    | 0.641        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0111      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.9         |
|    agent/rollout/ep_rew_wrapped_mean | 29.4         |
|    agent/time/fps                    | 4275         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0022188553 |
|    agent/train/clip_fraction         | 0.0889       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.813        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00935     |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.5         |
|    agent/rollout/ep_rew_wrapped_mean | 22           |
|    agent/time/fps                    | 4296         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0027833395 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.826        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0108       |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.7         |
|    agent/rollout/ep_rew_wrapped_mean | 15.5         |
|    agent/time/fps                    | 4281         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0020004523 |
|    agent/train/clip_fraction         | 0.0774       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.649       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00233     |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.3         |
|    agent/rollout/ep_rew_wrapped_mean | 7.07         |
|    agent/time/fps                    | 4285         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0026465696 |
|    agent/train/clip_fraction         | 0.0855       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.006        |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.2         |
|    agent/rollout/ep_rew_wrapped_mean | -3.59        |
|    agent/time/fps                    | 4417         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0024315924 |
|    agent/train/clip_fraction         | 0.119        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.654       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0116       |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32.9        |
|    agent/rollout/ep_rew_wrapped_mean | -10.1       |
|    agent/time/fps                    | 4126        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 20480       |
|    agent/train/approx_kl             | 0.004064129 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.651      |
|    agent/train/explained_variance    | 0.964       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00389    |
|    agent/train/n_updates             | 90          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.3         |
|    agent/rollout/ep_rew_wrapped_mean | -23.4        |
|    agent/time/fps                    | 3516         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0031632376 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.64        |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00436     |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.6         |
|    agent/rollout/ep_rew_wrapped_mean | -36.2        |
|    agent/time/fps                    | 4090         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0042677084 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.938        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0126       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | -40.6        |
|    agent/time/fps                    | 3940         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0037989249 |
|    agent/train/clip_fraction         | 0.247        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.655       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0408       |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.9         |
|    agent/rollout/ep_rew_wrapped_mean | -44.6        |
|    agent/time/fps                    | 4267         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0035331596 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.645       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0198       |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | -46.3        |
|    agent/time/fps                    | 3979         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0039318292 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.65        |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0355       |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | -47.4        |
|    agent/time/fps                    | 4396         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0052592317 |
|    agent/train/clip_fraction         | 0.299        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.64        |
|    agent/train/explained_variance    | 0.911        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0829       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.8         |
|    agent/rollout/ep_rew_wrapped_mean | -49.8        |
|    agent/time/fps                    | 3920         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0050698332 |
|    agent/train/clip_fraction         | 0.228        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.63        |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0507       |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.9        |
|    agent/rollout/ep_rew_wrapped_mean | -49.9       |
|    agent/time/fps                    | 4308        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.003777009 |
|    agent/train/clip_fraction         | 0.174       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.629      |
|    agent/train/explained_variance    | 0.911       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0434      |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.8        |
|    agent/rollout/ep_rew_wrapped_mean | -48.2       |
|    agent/time/fps                    | 4140        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.004805043 |
|    agent/train/clip_fraction         | 0.191       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.624      |
|    agent/train/explained_variance    | 0.948       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.12        |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 36.3       |
|    agent/rollout/ep_rew_wrapped_mean | -45.9      |
|    agent/time/fps                    | 3839       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 40960      |
|    agent/train/approx_kl             | 0.00262872 |
|    agent/train/clip_fraction         | 0.118      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.63      |
|    agent/train/explained_variance    | 0.86       |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0688     |
|    agent/train/n_updates             | 190        |
|    agent/train/policy_gradient_loss  | -0.00295 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.3         |
|    agent/rollout/ep_rew_wrapped_mean | -45          |
|    agent/time/fps                    | 4064         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0038036054 |
|    agent/train/clip_fraction         | 0.233        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.638       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.105        |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.1         |
|    agent/rollout/ep_rew_wrapped_mean | -42.4        |
|    agent/time/fps                    | 4000         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0027546035 |
|    agent/train/clip_fraction         | 0.14         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0391       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.4         |
|    agent/rollout/ep_rew_wrapped_mean | -47.7        |
|    agent/time/fps                    | 4173         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0025337953 |
|    agent/train/clip_fraction         | 0.125        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.611       |
|    agent/train/explained_variance    | 0.862        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0915       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 38.5        |
|    agent/rollout/ep_rew_wrapped_mean | -44.1       |
|    agent/time/fps                    | 3995        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.002479388 |
|    agent/train/clip_fraction         | 0.145       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.635      |
|    agent/train/explained_variance    | 0.955       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.051       |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 41           |
|    agent/rollout/ep_rew_wrapped_mean | -42.5        |
|    agent/time/fps                    | 4046         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0036799484 |
|    agent/train/clip_fraction         | 0.196        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.614       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0718       |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 44.8        |
|    agent/rollout/ep_rew_wrapped_mean | -36.4       |
|    agent/time/fps                    | 4041        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.004653048 |
|    agent/train/clip_fraction         | 0.183       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.606      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.201       |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 50.3         |
|    agent/rollout/ep_rew_wrapped_mean | -31.2        |
|    agent/time/fps                    | 3016         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0027706753 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.624       |
|    agent/train/explained_variance    | 0.895        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.173        |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 62.4         |
|    agent/rollout/ep_rew_wrapped_mean | -23.1        |
|    agent/time/fps                    | 2364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0026201485 |
|    agent/train/clip_fraction         | 0.14         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.616       |
|    agent/train/explained_variance    | 0.931        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.238        |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 71           |
|    agent/rollout/ep_rew_wrapped_mean | -11.9        |
|    agent/time/fps                    | 4128         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0034543327 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.587       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.247        |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 90          |
|    agent/rollout/ep_rew_wrapped_mean | -3.93       |
|    agent/time/fps                    | 4034        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 61440       |
|    agent/train/approx_kl             | 0.002642381 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.586      |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.233       |
|    agent/train/n_updates             | 290         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 103         |
|    agent/rollout/ep_rew_wrapped_mean | 10          |
|    agent/time/fps                    | 3834        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.008892133 |
|    agent/train/clip_fraction         | 0.206       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.569      |
|    agent/train/explained_variance    | 0.946       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0209      |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 121         |
|    agent/rollout/ep_rew_wrapped_mean | 25.7        |
|    agent/time/fps                    | 3679        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.003703689 |
|    agent/train/clip_fraction         | 0.165       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.568      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0103      |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 139          |
|    agent/rollout/ep_rew_wrapped_mean | 46.2         |
|    agent/time/fps                    | 3403         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0021094603 |
|    agent/train/clip_fraction         | 0.195        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.526       |
|    agent/train/explained_variance    | 0.903        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00417      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 157          |
|    agent/rollout/ep_rew_wrapped_mean | 59.7         |
|    agent/time/fps                    | 3937         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0035896553 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.513       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0116       |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 176          |
|    agent/rollout/ep_rew_wrapped_mean | 76.2         |
|    agent/time/fps                    | 4055         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0020499297 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.518       |
|    agent/train/explained_variance    | 0.944        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00226     |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 193          |
|    agent/rollout/ep_rew_wrapped_mean | 90.5         |
|    agent/time/fps                    | 3901         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0033507121 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.486       |
|    agent/train/explained_variance    | 0.99         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00733      |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 211          |
|    agent/rollout/ep_rew_wrapped_mean | 111          |
|    agent/time/fps                    | 3935         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0056895143 |
|    agent/train/clip_fraction         | 0.211        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.475       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00678      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 230          |
|    agent/rollout/ep_rew_wrapped_mean | 128          |
|    agent/time/fps                    | 4057         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0026791058 |
|    agent/train/clip_fraction         | 0.138        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.458       |
|    agent/train/explained_variance    | 0.997        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00786     |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 248          |
|    agent/rollout/ep_rew_wrapped_mean | 143          |
|    agent/time/fps                    | 4155         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0027572531 |
|    agent/train/clip_fraction         | 0.168        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.458       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00876     |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 266          |
|    agent/rollout/ep_rew_wrapped_mean | 160          |
|    agent/time/fps                    | 3692         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0040001567 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.461       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00288     |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 282         |
|    agent/rollout/ep_rew_wrapped_mean | 177         |
|    agent/time/fps                    | 3771        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 83968       |
|    agent/train/approx_kl             | 0.003130761 |
|    agent/train/clip_fraction         | 0.141       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.478      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00262     |
|    agent/train/n_updates             | 400         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 299         |
|    agent/rollout/ep_rew_wrapped_mean | 193         |
|    agent/time/fps                    | 4090        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.004286575 |
|    agent/train/clip_fraction         | 0.159       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.477      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00992    |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 310         |
|    agent/rollout/ep_rew_wrapped_mean | 202         |
|    agent/time/fps                    | 3990        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.004060454 |
|    agent/train/clip_fraction         | 0.155       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.439      |
|    agent/train/explained_variance    | 0.996       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0087      |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 321          |
|    agent/rollout/ep_rew_wrapped_mean | 212          |
|    agent/time/fps                    | 3043         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0020653321 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.454       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0138       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 331          |
|    agent/rollout/ep_rew_wrapped_mean | 225          |
|    agent/time/fps                    | 4346         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0032262322 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.456       |
|    agent/train/explained_variance    | 0.999        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0099      |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 343          |
|    agent/rollout/ep_rew_wrapped_mean | 234          |
|    agent/time/fps                    | 3848         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0052084727 |
|    agent/train/clip_fraction         | 0.209        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.438       |
|    agent/train/explained_variance    | 0.998        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00253      |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 358          |
|    agent/rollout/ep_rew_wrapped_mean | 241          |
|    agent/time/fps                    | 4042         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0031563116 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.423       |
|    agent/train/explained_variance    | 0.996        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00502     |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 373          |
|    agent/rollout/ep_rew_wrapped_mean | 248          |
|    agent/time/fps                    | 3977         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0043138387 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.426       |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0134      |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 390          |
|    agent/rollout/ep_rew_wrapped_mean | 249          |
|    agent/time/fps                    | 4078         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0021319143 |
|    agent/train/clip_fraction         | 0.135        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.4         |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00648      |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 406          |
|    agent/rollout/ep_rew_wrapped_mean | 264          |
|    agent/time/fps                    | 4075         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0022841152 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.393       |
|    agent/train/explained_variance    | 0.973        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0193       |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 421          |
|    agent/rollout/ep_rew_wrapped_mean | 273          |
|    agent/time/fps                    | 3293         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0034534638 |
|    agent/train/clip_fraction         | 0.112        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.391       |
|    agent/train/explained_variance    | 0.936        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0124       |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 434          |
|    agent/rollout/ep_rew_wrapped_mean | 278          |
|    agent/time/fps                    | 3554         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0024989457 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.383       |
|    agent/train/explained_variance    | 0.901        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00367     |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 439          |
|    agent/rollout/ep_rew_wrapped_mean | 283          |
|    agent/time/fps                    | 4059         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0014665995 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.914        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00257     |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 449          |
|    agent/rollout/ep_rew_wrapped_mean | 290          |
|    agent/time/fps                    | 4048         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0016422617 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.345       |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00722      |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 449          |
|    agent/rollout/ep_rew_wrapped_mean | 293          |
|    agent/time/fps                    | 4238         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0014646519 |
|    agent/train/clip_fraction         | 0.0828       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.336       |
|    agent/train/explained_variance    | 0.9          |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0065      |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 456          |
|    agent/rollout/ep_rew_wrapped_mean | 297          |
|    agent/time/fps                    | 3559         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0036279932 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.345       |
|    agent/train/explained_variance    | 0.923        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00268      |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 456          |
|    agent/rollout/ep_rew_wrapped_mean | 305          |
|    agent/time/fps                    | 1959         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0037890738 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.343       |
|    agent/train/explained_variance    | 0.896        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0127      |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 456          |
|    agent/rollout/ep_rew_wrapped_mean | 314          |
|    agent/time/fps                    | 3923         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0069648344 |
|    agent/train/clip_fraction         | 0.128        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.33        |
|    agent/train/explained_variance    | 0.942        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00902     |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 456          |
|    agent/rollout/ep_rew_wrapped_mean | 318          |
|    agent/time/fps                    | 3930         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0023174905 |
|    agent/train/clip_fraction         | 0.149        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.33        |
|    agent/train/explained_variance    | 0.835        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00584      |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 456         |
|    agent/rollout/ep_rew_wrapped_mean | 328         |
|    agent/time/fps                    | 4179        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.003270393 |
|    agent/train/clip_fraction         | 0.106       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.314      |
|    agent/train/explained_variance    | 0.91        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00434    |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 458          |
|    agent/rollout/ep_rew_wrapped_mean | 338          |
|    agent/time/fps                    | 4222         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0072393497 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.304       |
|    agent/train/explained_variance    | 0.769        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0121      |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.112 MB of 0.112 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▂▄▅▆██████████████████████████████████
time/fps,█▂▂▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▅▇▅▄▂▃▃▃▄▅▅▆▂▂▄▇▅▂▄▄▃▃▂▃▁▄▂▂▄▂▂▃▂▂▃█▃▁▃
train/clip_fraction,▂▅█▅▃▃▃▃▃▃▃▂▂▂▁▂▂▂▁▂▂▂▁▁▁▁▂▁▁▂▂▂▂▁▁▁▂▁▁▂
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▄▅▆▆▇▇▇▇▇▇███████████████████████████
train/explained_variance,▁▇█▇████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2868.0
train/approx_kl,0.00222
train/clip_fraction,0.05566
train/clip_range,0.1
train/entropy_loss,-0.12147
train/explained_variance,0.99874
train/learning_rate,0.002


 20%|████████▊                                   | 1/5 [06:12<24:50, 372.52s/it]

Query schedule: [50, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5]
Collecting 100 fragments (20000 transitions)
Requested 12000 transitions but only 0 in buffer. Sampling 12000 additional transitions.
Sampling 8000 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 34       |
|    agent/rollout/ep_rew_wrapped_mean | 323      |
|    agent/time/fps                    | 3904     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 34       |
|    agent/rollout/ep_rew_wrapped_mean | 323      |
|    agent/time/fps                    | 3.9e+03  |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.5         |
|    agent/rollout/ep_rew_wrapped_mean | 272          |
|    agent/time/fps                    | 4071         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0026239913 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.469        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000275     |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.8         |
|    agent/rollout/ep_rew_wrapped_mean | 238          |
|    agent/time/fps                    | 4208         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0035945312 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.684       |
|    agent/train/explained_variance    | 0.706        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00821      |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 36.6         |
|    agent/rollout/ep_rew_wrapped_mean | 216          |
|    agent/time/fps                    | 4327         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0028663338 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.796        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00333     |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.9         |
|    agent/rollout/ep_rew_wrapped_mean | 199          |
|    agent/time/fps                    | 4479         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0029046135 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.687       |
|    agent/train/explained_variance    | 0.514        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0196      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.2         |
|    agent/rollout/ep_rew_wrapped_mean | 185          |
|    agent/time/fps                    | 4322         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0035199742 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.868        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00361     |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.6         |
|    agent/rollout/ep_rew_wrapped_mean | 174          |
|    agent/time/fps                    | 4154         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0031860585 |
|    agent/train/clip_fraction         | 0.214        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.677       |
|    agent/train/explained_variance    | 0.836        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00454     |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.3         |
|    agent/rollout/ep_rew_wrapped_mean | 156          |
|    agent/time/fps                    | 3983         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0032372102 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.669       |
|    agent/train/explained_variance    | 0.877        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.012       |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.6         |
|    agent/rollout/ep_rew_wrapped_mean | 132          |
|    agent/time/fps                    | 4103         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0022058147 |
|    agent/train/clip_fraction         | 0.0993       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.675       |
|    agent/train/explained_variance    | 0.803        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00906     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.6         |
|    agent/rollout/ep_rew_wrapped_mean | 105          |
|    agent/time/fps                    | 4374         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0022582614 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.667       |
|    agent/train/explained_variance    | 0.863        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00307      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.6         |
|    agent/rollout/ep_rew_wrapped_mean | 85.2         |
|    agent/time/fps                    | 4378         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0030380515 |
|    agent/train/clip_fraction         | 0.167        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.826        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0126       |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 34           |
|    agent/rollout/ep_rew_wrapped_mean | 70.7         |
|    agent/time/fps                    | 4375         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0029198762 |
|    agent/train/clip_fraction         | 0.203        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.658       |
|    agent/train/explained_variance    | 0.893        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00956      |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.9         |
|    agent/rollout/ep_rew_wrapped_mean | 71.7         |
|    agent/time/fps                    | 3710         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0022344613 |
|    agent/train/clip_fraction         | 0.145        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.659       |
|    agent/train/explained_variance    | 0.799        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00534      |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 34.1        |
|    agent/rollout/ep_rew_wrapped_mean | 73.4        |
|    agent/time/fps                    | 3765        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.004688849 |
|    agent/train/clip_fraction         | 0.241       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.641      |
|    agent/train/explained_variance    | 0.935       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00807     |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.9         |
|    agent/rollout/ep_rew_wrapped_mean | 74.9         |
|    agent/time/fps                    | 4210         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0038992956 |
|    agent/train/clip_fraction         | 0.222        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.657       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000537     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.6         |
|    agent/rollout/ep_rew_wrapped_mean | 77.8         |
|    agent/time/fps                    | 4147         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0028858453 |
|    agent/train/clip_fraction         | 0.171        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.636       |
|    agent/train/explained_variance    | 0.811        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0447       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.9         |
|    agent/rollout/ep_rew_wrapped_mean | 80.7         |
|    agent/time/fps                    | 3730         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0040348535 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.622       |
|    agent/train/explained_variance    | 0.741        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.055        |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.1         |
|    agent/rollout/ep_rew_wrapped_mean | 83.5         |
|    agent/time/fps                    | 4226         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0022491042 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.599       |
|    agent/train/explained_variance    | 0.853        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0289       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.6         |
|    agent/rollout/ep_rew_wrapped_mean | 87           |
|    agent/time/fps                    | 3596         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0038404015 |
|    agent/train/clip_fraction         | 0.196        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.623       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0144       |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 37          |
|    agent/rollout/ep_rew_wrapped_mean | 89.8        |
|    agent/time/fps                    | 3680        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.005038012 |
|    agent/train/clip_fraction         | 0.173       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.597      |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000248   |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.1         |
|    agent/rollout/ep_rew_wrapped_mean | 92.6         |
|    agent/time/fps                    | 3853         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0034597507 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.584       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0321       |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 38.3         |
|    agent/rollout/ep_rew_wrapped_mean | 95.5         |
|    agent/time/fps                    | 4053         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0023465478 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.562       |
|    agent/train/explained_variance    | 0.928        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0156       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 39.1         |
|    agent/rollout/ep_rew_wrapped_mean | 99.5         |
|    agent/time/fps                    | 3788         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0030519278 |
|    agent/train/clip_fraction         | 0.132        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.578       |
|    agent/train/explained_variance    | 0.959        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0436       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 40.4        |
|    agent/rollout/ep_rew_wrapped_mean | 104         |
|    agent/time/fps                    | 3519        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.004022381 |
|    agent/train/clip_fraction         | 0.154       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.578      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00249     |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 42.6        |
|    agent/rollout/ep_rew_wrapped_mean | 108         |
|    agent/time/fps                    | 3493        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.006551111 |
|    agent/train/clip_fraction         | 0.236       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.554      |
|    agent/train/explained_variance    | 0.972       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0131      |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 43.7        |
|    agent/rollout/ep_rew_wrapped_mean | 114         |
|    agent/time/fps                    | 4066        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.003533236 |
|    agent/train/clip_fraction         | 0.139       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.564      |
|    agent/train/explained_variance    | 0.967       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0277      |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 45           |
|    agent/rollout/ep_rew_wrapped_mean | 119          |
|    agent/time/fps                    | 4157         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0026298673 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.579       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0194       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 46.3        |
|    agent/rollout/ep_rew_wrapped_mean | 124         |
|    agent/time/fps                    | 3206        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.006012883 |
|    agent/train/clip_fraction         | 0.242       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.559      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00651     |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.6         |
|    agent/rollout/ep_rew_wrapped_mean | 127          |
|    agent/time/fps                    | 3189         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0031772596 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.59        |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0832       |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 49.3         |
|    agent/rollout/ep_rew_wrapped_mean | 131          |
|    agent/time/fps                    | 3797         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0033583958 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.576       |
|    agent/train/explained_variance    | 0.98         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.129        |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 52.4        |
|    agent/rollout/ep_rew_wrapped_mean | 134         |
|    agent/time/fps                    | 3905        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.005331707 |
|    agent/train/clip_fraction         | 0.194       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.546      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0973      |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 55.1        |
|    agent/rollout/ep_rew_wrapped_mean | 139         |
|    agent/time/fps                    | 3719        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 65536       |
|    agent/train/approx_kl             | 0.001936235 |
|    agent/train/clip_fraction         | 0.127       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.507      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.776       |
|    agent/train/n_updates             | 310         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 56.9         |
|    agent/rollout/ep_rew_wrapped_mean | 144          |
|    agent/time/fps                    | 3667         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0044038324 |
|    agent/train/clip_fraction         | 0.161        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.503       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0612       |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 60.7        |
|    agent/rollout/ep_rew_wrapped_mean | 148         |
|    agent/time/fps                    | 4018        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.003857457 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.505      |
|    agent/train/explained_variance    | 0.992       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0538      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 65.8        |
|    agent/rollout/ep_rew_wrapped_mean | 154         |
|    agent/time/fps                    | 4344        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.004295567 |
|    agent/train/clip_fraction         | 0.193       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.468      |
|    agent/train/explained_variance    | 0.99        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0382      |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 69           |
|    agent/rollout/ep_rew_wrapped_mean | 162          |
|    agent/time/fps                    | 4189         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0037415284 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.488       |
|    agent/train/explained_variance    | 0.987        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.243        |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 73.7        |
|    agent/rollout/ep_rew_wrapped_mean | 167         |
|    agent/time/fps                    | 3553        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 75776       |
|    agent/train/approx_kl             | 0.004139443 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.466      |
|    agent/train/explained_variance    | 0.995       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0591      |
|    agent/train/n_updates             | 360         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 78.9        |
|    agent/rollout/ep_rew_wrapped_mean | 172         |
|    agent/time/fps                    | 4043        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.005275974 |
|    agent/train/clip_fraction         | 0.175       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.473      |
|    agent/train/explained_variance    | 0.991       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0291      |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 84.5         |
|    agent/rollout/ep_rew_wrapped_mean | 179          |
|    agent/time/fps                    | 3768         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0038955975 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.483       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0514       |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 93          |
|    agent/rollout/ep_rew_wrapped_mean | 185         |
|    agent/time/fps                    | 4005        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.003614171 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.472      |
|    agent/train/explained_variance    | 0.983       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0739      |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 102          |
|    agent/rollout/ep_rew_wrapped_mean | 194          |
|    agent/time/fps                    | 3823         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0038265276 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.454       |
|    agent/train/explained_variance    | 0.955        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0963       |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 108         |
|    agent/rollout/ep_rew_wrapped_mean | 203         |
|    agent/time/fps                    | 3871        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.003955071 |
|    agent/train/clip_fraction         | 0.152       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.466      |
|    agent/train/explained_variance    | 0.985       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0801      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 117          |
|    agent/rollout/ep_rew_wrapped_mean | 213          |
|    agent/time/fps                    | 3961         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0069299582 |
|    agent/train/clip_fraction         | 0.159        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.454       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.136        |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 127         |
|    agent/rollout/ep_rew_wrapped_mean | 224         |
|    agent/time/fps                    | 4214        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.004866665 |
|    agent/train/clip_fraction         | 0.205       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.45       |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0943      |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 137         |
|    agent/rollout/ep_rew_wrapped_mean | 237         |
|    agent/time/fps                    | 3913        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.006534801 |
|    agent/train/clip_fraction         | 0.194       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.484      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.144       |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 144         |
|    agent/rollout/ep_rew_wrapped_mean | 249         |
|    agent/time/fps                    | 3989        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.004464408 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.515      |
|    agent/train/explained_variance    | 0.989       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.108       |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 156         |
|    agent/rollout/ep_rew_wrapped_mean | 257         |
|    agent/time/fps                    | 4182        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.004929157 |
|    agent/train/clip_fraction         | 0.168       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.527      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0501      |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 162         |
|    agent/rollout/ep_rew_wrapped_mean | 269         |
|    agent/time/fps                    | 4024        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.004403756 |
|    agent/train/clip_fraction         | 0.183       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.524      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0453      |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 178         |
|    agent/rollout/ep_rew_wrapped_mean | 279         |
|    agent/time/fps                    | 3930        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 100352      |
|    agent/train/approx_kl             | 0.006070449 |
|    agent/train/clip_fraction         | 0.176       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.518      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.11        |
|    agent/train/n_updates             | 480         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 189          |
|    agent/rollout/ep_rew_wrapped_mean | 298          |
|    agent/time/fps                    | 3699         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0043092715 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.508       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.126        |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 207         |
|    agent/rollout/ep_rew_wrapped_mean | 310         |
|    agent/time/fps                    | 4118        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 104448      |
|    agent/train/approx_kl             | 0.005987551 |
|    agent/train/clip_fraction         | 0.204       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.523      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.106       |
|    agent/train/n_updates             | 500         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 223         |
|    agent/rollout/ep_rew_wrapped_mean | 327         |
|    agent/time/fps                    | 3788        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.004201405 |
|    agent/train/clip_fraction         | 0.208       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.472      |
|    agent/train/explained_variance    | 0.812       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0113      |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 239          |
|    agent/rollout/ep_rew_wrapped_mean | 345          |
|    agent/time/fps                    | 3949         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0044970047 |
|    agent/train/clip_fraction         | 0.249        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.469       |
|    agent/train/explained_variance    | 0.534        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00875      |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 255          |
|    agent/rollout/ep_rew_wrapped_mean | 365          |
|    agent/time/fps                    | 3939         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0021368717 |
|    agent/train/clip_fraction         | 0.108        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.508       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0396       |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 272          |
|    agent/rollout/ep_rew_wrapped_mean | 380          |
|    agent/time/fps                    | 4037         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0032344512 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.499       |
|    agent/train/explained_variance    | 0.981        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0335       |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 289         |
|    agent/rollout/ep_rew_wrapped_mean | 395         |
|    agent/time/fps                    | 3838        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.004970251 |
|    agent/train/clip_fraction         | 0.146       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.462      |
|    agent/train/explained_variance    | 0.748       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00589     |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 305          |
|    agent/rollout/ep_rew_wrapped_mean | 410          |
|    agent/time/fps                    | 4420         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 116736       |
|    agent/train/approx_kl             | 0.0050793756 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.459       |
|    agent/train/explained_variance    | 0.941        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0207       |
|    agent/train/n_updates             | 560          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 320         |
|    agent/rollout/ep_rew_wrapped_mean | 422         |
|    agent/time/fps                    | 3995        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 118784      |
|    agent/train/approx_kl             | 0.004211786 |
|    agent/train/clip_fraction         | 0.151       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.444      |
|    agent/train/explained_variance    | 0.969       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00162    |
|    agent/train/n_updates             | 570         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 335          |
|    agent/rollout/ep_rew_wrapped_mean | 433          |
|    agent/time/fps                    | 3643         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0033861022 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.436       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.018        |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 349         |
|    agent/rollout/ep_rew_wrapped_mean | 445         |
|    agent/time/fps                    | 4197        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.004518016 |
|    agent/train/clip_fraction         | 0.165       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.445      |
|    agent/train/explained_variance    | 0.93        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0159     |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 364         |
|    agent/rollout/ep_rew_wrapped_mean | 451         |
|    agent/time/fps                    | 4118        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.004601646 |
|    agent/train/clip_fraction         | 0.185       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.435      |
|    agent/train/explained_variance    | 0.906       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00376    |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▂▃▅▆▇▇███████████████████████████████
time/fps,█▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/approx_kl,▂▂▂▂▁▄▃▃▄▃▁▆▁▃▄▃▂▃█▄▄▂▄▃▃▆▄▄▃▂▅▇▁▁▄▃▃▄▄▂
train/clip_fraction,▃▃▇█▄▇▅▄▄▄▃▆▃▂▄▅▃▃▅▃▃▂▃▃▃▄▃▃▃▂▃▃▃▁▃▂▃▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▂▄▅▆▇▇▇▇▇▇▇█▇████████████████████████
train/explained_variance,▁▇▇█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2825.0
train/approx_kl,0.00332
train/clip_fraction,0.08174
train/clip_range,0.1
train/entropy_loss,-0.1708
train/explained_variance,0.9997
train/learning_rate,0.002


 40%|█████████████████▌                          | 2/5 [12:37<18:59, 379.70s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011225871299393475, max=1.0…

Query schedule: [50, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5]
Collecting 100 fragments (20000 transitions)
Requested 12000 transitions but only 0 in buffer. Sampling 12000 additional transitions.
Sampling 8000 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 21       |
|    agent/rollout/ep_rew_wrapped_mean | -30.2    |
|    agent/time/fps                    | 4238     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 21       |
|    agent/rollout/ep_rew_wrapped_mean | -30.2    |
|    agent/time/fps                    | 4.24e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.6         |
|    agent/rollout/ep_rew_wrapped_mean | -36.8        |
|    agent/time/fps                    | 4364         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0011457308 |
|    agent/train/clip_fraction         | 0.0485       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.692       |
|    agent/train/explained_variance    | -0.436       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0123       |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -39.7        |
|    agent/time/fps                    | 4244         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0019724984 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.688       |
|    agent/train/explained_variance    | 0.657        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000825     |
|    agent/train/n_updates             | 20           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.3         |
|    agent/rollout/ep_rew_wrapped_mean | -43.6        |
|    agent/time/fps                    | 4443         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0033468537 |
|    agent/train/clip_fraction         | 0.17         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.679       |
|    agent/train/explained_variance    | 0.686        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0236       |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 31.1         |
|    agent/rollout/ep_rew_wrapped_mean | -46.7        |
|    agent/time/fps                    | 4246         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0039270665 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.663       |
|    agent/train/explained_variance    | 0.886        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0126       |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.8         |
|    agent/rollout/ep_rew_wrapped_mean | -48          |
|    agent/time/fps                    | 4210         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0035344816 |
|    agent/train/clip_fraction         | 0.187        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.672       |
|    agent/train/explained_variance    | 0.825        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0062      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.6         |
|    agent/rollout/ep_rew_wrapped_mean | -49.4        |
|    agent/time/fps                    | 4467         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0037335681 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.652       |
|    agent/train/explained_variance    | 0.887        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0286       |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.9        |
|    agent/rollout/ep_rew_wrapped_mean | -53.6       |
|    agent/time/fps                    | 4385        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.004273042 |
|    agent/train/clip_fraction         | 0.22        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.633      |
|    agent/train/explained_variance    | 0.941       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0102     |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 30.3       |
|    agent/rollout/ep_rew_wrapped_mean | -56.5      |
|    agent/time/fps                    | 4096       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 18432      |
|    agent/train/approx_kl             | 0.00469116 |
|    agent/train/clip_fraction         | 0.241      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.618     |
|    agent/train/explained_variance    | 0.828      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0243     |
|    agent/train/n_updates             | 80         |
|    agent/train/policy_gradient_loss  | -0.00878 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.6         |
|    agent/rollout/ep_rew_wrapped_mean | -63.7        |
|    agent/time/fps                    | 4411         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0069228797 |
|    agent/train/clip_fraction         | 0.246        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.55        |
|    agent/train/explained_variance    | 0.882        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00641      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 30.2         |
|    agent/rollout/ep_rew_wrapped_mean | -67.5        |
|    agent/time/fps                    | 4064         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0069801845 |
|    agent/train/clip_fraction         | 0.202        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.593       |
|    agent/train/explained_variance    | 0.812        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00517     |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 29.8       |
|    agent/rollout/ep_rew_wrapped_mean | -69.5      |
|    agent/time/fps                    | 4226       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 24576      |
|    agent/train/approx_kl             | 0.01354083 |
|    agent/train/clip_fraction         | 0.222      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.502     |
|    agent/train/explained_variance    | 0.936      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0213     |
|    agent/train/n_updates             | 110        |
|    agent/train/policy_gradient_loss  | -0.0119  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 29.9       |
|    agent/rollout/ep_rew_wrapped_mean | -71        |
|    agent/time/fps                    | 4302       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 26624      |
|    agent/train/approx_kl             | 0.00824396 |
|    agent/train/clip_fraction         | 0.0976     |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.385     |
|    agent/train/explained_variance    | 0.918      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00552    |
|    agent/train/n_updates             | 120        |
|    agent/train/policy_gradient_loss  | -0.00228 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.8        |
|    agent/rollout/ep_rew_wrapped_mean | -71.9       |
|    agent/time/fps                    | 4394        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 28672       |
|    agent/train/approx_kl             | 0.015760824 |
|    agent/train/clip_fraction         | 0.217       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.391      |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00103     |
|    agent/train/n_updates             | 130         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.5        |
|    agent/rollout/ep_rew_wrapped_mean | -72.2       |
|    agent/time/fps                    | 4244        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 30720       |
|    agent/train/approx_kl             | 0.007839387 |
|    agent/train/clip_fraction         | 0.13        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.339      |
|    agent/train/explained_variance    | 0.966       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00419    |
|    agent/train/n_updates             | 140         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.1         |
|    agent/rollout/ep_rew_wrapped_mean | -71.3        |
|    agent/time/fps                    | 4138         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0035051021 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.37        |
|    agent/train/explained_variance    | 0.863        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0198       |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | -69.7        |
|    agent/time/fps                    | 4065         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0068127317 |
|    agent/train/clip_fraction         | 0.0967       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.339       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.021        |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.9        |
|    agent/rollout/ep_rew_wrapped_mean | -68.3       |
|    agent/time/fps                    | 3365        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.004120633 |
|    agent/train/clip_fraction         | 0.0837      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.264      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.014       |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29           |
|    agent/rollout/ep_rew_wrapped_mean | -66.2        |
|    agent/time/fps                    | 4085         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0032378074 |
|    agent/train/clip_fraction         | 0.0628       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.228       |
|    agent/train/explained_variance    | 0.956        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0135       |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.7         |
|    agent/rollout/ep_rew_wrapped_mean | -64.2        |
|    agent/time/fps                    | 4302         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0019330434 |
|    agent/train/clip_fraction         | 0.0745       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.321       |
|    agent/train/explained_variance    | 0.923        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0215       |
|    agent/train/n_updates             | 190          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.8        |
|    agent/rollout/ep_rew_wrapped_mean | -62         |
|    agent/time/fps                    | 3963        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.003026755 |
|    agent/train/clip_fraction         | 0.0766      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.165      |
|    agent/train/explained_variance    | 0.97        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0225      |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 28.9         |
|    agent/rollout/ep_rew_wrapped_mean | -59.4        |
|    agent/time/fps                    | 3926         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0030931046 |
|    agent/train/clip_fraction         | 0.111        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.343       |
|    agent/train/explained_variance    | 0.963        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0182       |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.2        |
|    agent/rollout/ep_rew_wrapped_mean | -56.8       |
|    agent/time/fps                    | 3537        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 47104       |
|    agent/train/approx_kl             | 0.001752766 |
|    agent/train/clip_fraction         | 0.0662      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.351      |
|    agent/train/explained_variance    | 0.934       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0272      |
|    agent/train/n_updates             | 220         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 29.8       |
|    agent/rollout/ep_rew_wrapped_mean | -54        |
|    agent/time/fps                    | 4107       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 49152      |
|    agent/train/approx_kl             | 0.00255602 |
|    agent/train/clip_fraction         | 0.0948     |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.305     |
|    agent/train/explained_variance    | 0.982      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.0216     |
|    agent/train/n_updates             | 230        |
|    agent/train/policy_gradient_loss  | -0.0044  

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 32          |
|    agent/rollout/ep_rew_wrapped_mean | -51.9       |
|    agent/time/fps                    | 4131        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.004024891 |
|    agent/train/clip_fraction         | 0.158       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.495      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0142      |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 36          |
|    agent/rollout/ep_rew_wrapped_mean | -48.7       |
|    agent/time/fps                    | 4141        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.006198363 |
|    agent/train/clip_fraction         | 0.133       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.418      |
|    agent/train/explained_variance    | 0.955       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0147      |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40.2         |
|    agent/rollout/ep_rew_wrapped_mean | -44.6        |
|    agent/time/fps                    | 3860         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0052800216 |
|    agent/train/clip_fraction         | 0.148        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.405       |
|    agent/train/explained_variance    | 0.984        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0456       |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 47.4        |
|    agent/rollout/ep_rew_wrapped_mean | -40.2       |
|    agent/time/fps                    | 3694        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 57344       |
|    agent/train/approx_kl             | 0.004762454 |
|    agent/train/clip_fraction         | 0.162       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.519      |
|    agent/train/explained_variance    | 0.976       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.02        |
|    agent/train/n_updates             | 270         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 51.8         |
|    agent/rollout/ep_rew_wrapped_mean | -34.6        |
|    agent/time/fps                    | 4127         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0036350035 |
|    agent/train/clip_fraction         | 0.17         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.937        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00493      |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 57.9         |
|    agent/rollout/ep_rew_wrapped_mean | -30.8        |
|    agent/time/fps                    | 3625         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0017483982 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.596       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00513     |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 63.2         |
|    agent/rollout/ep_rew_wrapped_mean | -26.7        |
|    agent/time/fps                    | 3865         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0011454627 |
|    agent/train/clip_fraction         | 0.0822       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.989        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00113     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 71.1         |
|    agent/rollout/ep_rew_wrapped_mean | -23.9        |
|    agent/time/fps                    | 3655         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0018921061 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.597       |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00396     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 76.1         |
|    agent/rollout/ep_rew_wrapped_mean | -20.3        |
|    agent/time/fps                    | 4110         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0062429784 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.61        |
|    agent/train/explained_variance    | 0.992        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0212       |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 81.8        |
|    agent/rollout/ep_rew_wrapped_mean | -16.1       |
|    agent/time/fps                    | 4272        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.002100702 |
|    agent/train/clip_fraction         | 0.138       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.605      |
|    agent/train/explained_variance    | 0.993       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0185     |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 92.4         |
|    agent/rollout/ep_rew_wrapped_mean | -11.5        |
|    agent/time/fps                    | 3836         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0030993554 |
|    agent/train/clip_fraction         | 0.0998       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.6         |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0124       |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 105          |
|    agent/rollout/ep_rew_wrapped_mean | -4.63        |
|    agent/time/fps                    | 3120         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0029756608 |
|    agent/train/clip_fraction         | 0.186        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.593       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000932     |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 117         |
|    agent/rollout/ep_rew_wrapped_mean | 5.77        |
|    agent/time/fps                    | 3988        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 75776       |
|    agent/train/approx_kl             | 0.002133347 |
|    agent/train/clip_fraction         | 0.102       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.595      |
|    agent/train/explained_variance    | 0.876       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00934     |
|    agent/train/n_updates             | 360         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 136         |
|    agent/rollout/ep_rew_wrapped_mean | 16.2        |
|    agent/time/fps                    | 4336        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.002741702 |
|    agent/train/clip_fraction         | 0.16        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.586      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0102      |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 155         |
|    agent/rollout/ep_rew_wrapped_mean | 28.7        |
|    agent/time/fps                    | 3998        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.004378829 |
|    agent/train/clip_fraction         | 0.153       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.572      |
|    agent/train/explained_variance    | 0.661       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00789     |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 172          |
|    agent/rollout/ep_rew_wrapped_mean | 39.8         |
|    agent/time/fps                    | 3645         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0037970366 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.559       |
|    agent/train/explained_variance    | 0.841        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00431     |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 187          |
|    agent/rollout/ep_rew_wrapped_mean | 50.5         |
|    agent/time/fps                    | 4333         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0026144534 |
|    agent/train/clip_fraction         | 0.129        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.552       |
|    agent/train/explained_variance    | 0.764        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00154      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 206          |
|    agent/rollout/ep_rew_wrapped_mean | 59           |
|    agent/time/fps                    | 4040         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0022570014 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00509     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 224          |
|    agent/rollout/ep_rew_wrapped_mean | 67           |
|    agent/time/fps                    | 3993         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0036588954 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.54        |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00114      |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 243          |
|    agent/rollout/ep_rew_wrapped_mean | 76.4         |
|    agent/time/fps                    | 3826         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0039247232 |
|    agent/train/clip_fraction         | 0.169        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.515       |
|    agent/train/explained_variance    | 0.872        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0014       |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 257          |
|    agent/rollout/ep_rew_wrapped_mean | 88.5         |
|    agent/time/fps                    | 3878         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0030763645 |
|    agent/train/clip_fraction         | 0.147        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.5         |
|    agent/train/explained_variance    | 0.835        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0246       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 274         |
|    agent/rollout/ep_rew_wrapped_mean | 101         |
|    agent/time/fps                    | 4080        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.002933114 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.514      |
|    agent/train/explained_variance    | 0.928       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0368      |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 289          |
|    agent/rollout/ep_rew_wrapped_mean | 120          |
|    agent/time/fps                    | 4059         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0034039374 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.512       |
|    agent/train/explained_variance    | 0.915        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0417       |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 302          |
|    agent/rollout/ep_rew_wrapped_mean | 135          |
|    agent/time/fps                    | 3771         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0015422567 |
|    agent/train/clip_fraction         | 0.108        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.505       |
|    agent/train/explained_variance    | 0.988        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0265       |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 313          |
|    agent/rollout/ep_rew_wrapped_mean | 152          |
|    agent/time/fps                    | 4055         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0017519908 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.494       |
|    agent/train/explained_variance    | 0.983        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0453       |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 325         |
|    agent/rollout/ep_rew_wrapped_mean | 169         |
|    agent/time/fps                    | 4168        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.008000186 |
|    agent/train/clip_fraction         | 0.165       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.507      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00812     |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 336         |
|    agent/rollout/ep_rew_wrapped_mean | 185         |
|    agent/time/fps                    | 4133        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 104448      |
|    agent/train/approx_kl             | 0.006288129 |
|    agent/train/clip_fraction         | 0.201       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.473      |
|    agent/train/explained_variance    | 0.994       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.026       |
|    agent/train/n_updates             | 500         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 349          |
|    agent/rollout/ep_rew_wrapped_mean | 205          |
|    agent/time/fps                    | 3761         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0049585328 |
|    agent/train/clip_fraction         | 0.219        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.464       |
|    agent/train/explained_variance    | 0.994        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0136      |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 360          |
|    agent/rollout/ep_rew_wrapped_mean | 222          |
|    agent/time/fps                    | 4086         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0036607736 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.439       |
|    agent/train/explained_variance    | 0.991        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00673     |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 375          |
|    agent/rollout/ep_rew_wrapped_mean | 239          |
|    agent/time/fps                    | 4192         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0069155954 |
|    agent/train/clip_fraction         | 0.209        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.399       |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00557     |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 500        |
|    agent/rollout/ep_rew_mean         | 387        |
|    agent/rollout/ep_rew_wrapped_mean | 249        |
|    agent/time/fps                    | 3916       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 112640     |
|    agent/train/approx_kl             | 0.00744244 |
|    agent/train/clip_fraction         | 0.195      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.381     |
|    agent/train/explained_variance    | 0.726      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.000426   |
|    agent/train/n_updates             | 540        |
|    agent/train/policy_gradient_loss  | -0.00211 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 401          |
|    agent/rollout/ep_rew_wrapped_mean | 262          |
|    agent/time/fps                    | 3499         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0032673818 |
|    agent/train/clip_fraction         | 0.153        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.369       |
|    agent/train/explained_variance    | 0.826        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00567     |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 412         |
|    agent/rollout/ep_rew_wrapped_mean | 277         |
|    agent/time/fps                    | 4195        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.008203616 |
|    agent/train/clip_fraction         | 0.195       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.347      |
|    agent/train/explained_variance    | 0.815       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00631    |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 425          |
|    agent/rollout/ep_rew_wrapped_mean | 294          |
|    agent/time/fps                    | 3986         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0019090069 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.332       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00248      |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 435          |
|    agent/rollout/ep_rew_wrapped_mean | 306          |
|    agent/time/fps                    | 3818         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0029009893 |
|    agent/train/clip_fraction         | 0.118        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.309       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00937     |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 439          |
|    agent/rollout/ep_rew_wrapped_mean | 318          |
|    agent/time/fps                    | 3954         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 122880       |
|    agent/train/approx_kl             | 0.0015590945 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.302       |
|    agent/train/explained_variance    | 0.995        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0039      |
|    agent/train/n_updates             | 590          |
|    agent/trai

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 443         |
|    agent/rollout/ep_rew_wrapped_mean | 332         |
|    agent/time/fps                    | 3839        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 124928      |
|    agent/train/approx_kl             | 0.005272241 |
|    agent/train/clip_fraction         | 0.156       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.289      |
|    agent/train/explained_variance    | 0.998       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00387    |
|    agent/train/n_updates             | 600         |
|    agent/train/policy_gradient_



VBox(children=(Label(value='0.109 MB of 0.109 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▁▁▂▃▅▇█████████████████████████████
time/fps,█▃▁▂▂▃▃▂▃▃▃▄▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
train/approx_kl,▃▂▃▃▂▂▃▃▄▄▃▅▄▄▂▃▅▂▂▂▄█▂▂▂▃▂▂▁▁▁▁▂▂▂▂▅▁▂▃
train/clip_fraction,▃▇▇▄▂▂█▇███▇▄▇▆▅▅▅▅▄▄▄▅▄▃▄▃▄▃▃▃▃▃▃▄▂▄▁▂▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▅▅▅▃▃▄▆▇▇▇▇▇▇▇▇▇█▇▇█▇▇██▇█▇█████▇████
train/explained_variance,▁▇█▇▇▇▇█▇▇██████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2798.0
train/approx_kl,0.004
train/clip_fraction,0.0792
train/clip_range,0.1
train/entropy_loss,-0.1354
train/explained_variance,0.996
train/learning_rate,0.002


 60%|██████████████████████████▍                 | 3/5 [18:52<12:35, 377.83s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011230960177878538, max=1.0…

Query schedule: [50, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5]
Collecting 100 fragments (20000 transitions)
Requested 12000 transitions but only 0 in buffer. Sampling 12000 additional transitions.
Sampling 8000 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 23.8     |
|    agent/rollout/ep_rew_wrapped_mean | 287      |
|    agent/time/fps                    | 4144     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 23.8     |
|    agent/rollout/ep_rew_wrapped_mean | 287      |
|    agent/time/fps                    | 4.14e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.4         |
|    agent/rollout/ep_rew_wrapped_mean | 229          |
|    agent/time/fps                    | 2325         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0022025295 |
|    agent/train/clip_fraction         | 0.0832       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | 0.52         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0196      |
|    agent/train/n_updates             | 10           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.8        |
|    agent/rollout/ep_rew_wrapped_mean | 192         |
|    agent/time/fps                    | 1824        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.004315187 |
|    agent/train/clip_fraction         | 0.292       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.688      |
|    agent/train/explained_variance    | 0.652       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0133     |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26          |
|    agent/rollout/ep_rew_wrapped_mean | 167         |
|    agent/time/fps                    | 4090        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 8192        |
|    agent/train/approx_kl             | 0.004223745 |
|    agent/train/clip_fraction         | 0.27        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.682      |
|    agent/train/explained_variance    | 0.468       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0059      |
|    agent/train/n_updates             | 30          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.1         |
|    agent/rollout/ep_rew_wrapped_mean | 148          |
|    agent/time/fps                    | 4441         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0030879546 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.681       |
|    agent/train/explained_variance    | 0.694        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0153      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.5         |
|    agent/rollout/ep_rew_wrapped_mean | 134          |
|    agent/time/fps                    | 4225         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0038308697 |
|    agent/train/clip_fraction         | 0.255        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.673       |
|    agent/train/explained_variance    | 0.76         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0204      |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 23.3        |
|    agent/rollout/ep_rew_wrapped_mean | 123         |
|    agent/time/fps                    | 4350        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.002576344 |
|    agent/train/clip_fraction         | 0.124       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.665      |
|    agent/train/explained_variance    | 0.825       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00555    |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.6         |
|    agent/rollout/ep_rew_wrapped_mean | 105          |
|    agent/time/fps                    | 4483         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0035337522 |
|    agent/train/clip_fraction         | 0.235        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.817        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000563    |
|    agent/train/n_updates             | 70           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | 82.1         |
|    agent/time/fps                    | 4386         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0038078707 |
|    agent/train/clip_fraction         | 0.231        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.638        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00219     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 20.8         |
|    agent/rollout/ep_rew_wrapped_mean | 61.8         |
|    agent/time/fps                    | 4078         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0034421165 |
|    agent/train/clip_fraction         | 0.165        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.639       |
|    agent/train/explained_variance    | 0.367        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0109      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 19.7        |
|    agent/rollout/ep_rew_wrapped_mean | 43.1        |
|    agent/time/fps                    | 4059        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.001910554 |
|    agent/train/clip_fraction         | 0.105       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.626      |
|    agent/train/explained_variance    | 0.719       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00277    |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 18.9         |
|    agent/rollout/ep_rew_wrapped_mean | 13.2         |
|    agent/time/fps                    | 4362         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0053868713 |
|    agent/train/clip_fraction         | 0.237        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.628       |
|    agent/train/explained_variance    | 0.857        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.012       |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 18.3        |
|    agent/rollout/ep_rew_wrapped_mean | 13.7        |
|    agent/time/fps                    | 4343        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 26624       |
|    agent/train/approx_kl             | 0.002812548 |
|    agent/train/clip_fraction         | 0.137       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.609      |
|    agent/train/explained_variance    | 0.699       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00999     |
|    agent/train/n_updates             | 120         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.9         |
|    agent/rollout/ep_rew_wrapped_mean | 15.9         |
|    agent/time/fps                    | 3541         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0032833098 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.61        |
|    agent/train/explained_variance    | 0.92         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00692     |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 17.3         |
|    agent/rollout/ep_rew_wrapped_mean | 18.2         |
|    agent/time/fps                    | 4261         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0030178456 |
|    agent/train/clip_fraction         | 0.185        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.618       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0189      |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.8         |
|    agent/rollout/ep_rew_wrapped_mean | 20.5         |
|    agent/time/fps                    | 4183         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0033813445 |
|    agent/train/clip_fraction         | 0.21         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.61        |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0156      |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.7         |
|    agent/rollout/ep_rew_wrapped_mean | 23           |
|    agent/time/fps                    | 4237         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0027135047 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.578       |
|    agent/train/explained_variance    | 0.868        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0169      |
|    agent/train/n_updates             | 160          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.5         |
|    agent/rollout/ep_rew_wrapped_mean | 24           |
|    agent/time/fps                    | 4255         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0021885226 |
|    agent/train/clip_fraction         | 0.098        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.562       |
|    agent/train/explained_variance    | 0.809        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00301     |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 16.1         |
|    agent/rollout/ep_rew_wrapped_mean | 24.2         |
|    agent/time/fps                    | 3960         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0026802314 |
|    agent/train/clip_fraction         | 0.0874       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.552       |
|    agent/train/explained_variance    | 0.86         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00035      |
|    agent/train/n_updates             | 180          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 15.7        |
|    agent/rollout/ep_rew_wrapped_mean | 24.4        |
|    agent/time/fps                    | 3850        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.003011764 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.562      |
|    agent/train/explained_variance    | 0.89        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0131     |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15.4         |
|    agent/rollout/ep_rew_wrapped_mean | 24.1         |
|    agent/time/fps                    | 4054         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0049263453 |
|    agent/train/clip_fraction         | 0.177        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.546       |
|    agent/train/explained_variance    | 0.912        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0141      |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 15           |
|    agent/rollout/ep_rew_wrapped_mean | 24           |
|    agent/time/fps                    | 4308         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0039697466 |
|    agent/train/clip_fraction         | 0.146        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.484       |
|    agent/train/explained_variance    | 0.944        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0256      |
|    agent/train/n_updates             | 210          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.8         |
|    agent/rollout/ep_rew_wrapped_mean | 23.6         |
|    agent/time/fps                    | 4237         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0035328963 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.516       |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0151      |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 14.5        |
|    agent/rollout/ep_rew_wrapped_mean | 22.8        |
|    agent/time/fps                    | 4077        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.004056601 |
|    agent/train/clip_fraction         | 0.191       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.495      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00624    |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 14.3         |
|    agent/rollout/ep_rew_wrapped_mean | 21.9         |
|    agent/time/fps                    | 4257         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0033050813 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.48        |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0175      |
|    agent/train/n_updates             | 240          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.7         |
|    agent/rollout/ep_rew_wrapped_mean | 20.9         |
|    agent/time/fps                    | 4273         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0019458162 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.469       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0127      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 13.1         |
|    agent/rollout/ep_rew_wrapped_mean | 20           |
|    agent/time/fps                    | 4071         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0036061164 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.44        |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0113      |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 11.9         |
|    agent/rollout/ep_rew_wrapped_mean | 18.4         |
|    agent/time/fps                    | 3032         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0024308967 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.445       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0163      |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 11.4        |
|    agent/rollout/ep_rew_wrapped_mean | 16.5        |
|    agent/time/fps                    | 4149        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 59392       |
|    agent/train/approx_kl             | 0.003055037 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.437      |
|    agent/train/explained_variance    | 0.935       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0112     |
|    agent/train/n_updates             | 280         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10.5         |
|    agent/rollout/ep_rew_wrapped_mean | 14           |
|    agent/time/fps                    | 3844         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0019610452 |
|    agent/train/clip_fraction         | 0.114        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.433       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0047      |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 10.2        |
|    agent/rollout/ep_rew_wrapped_mean | 11.6        |
|    agent/time/fps                    | 4181        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 63488       |
|    agent/train/approx_kl             | 0.003586132 |
|    agent/train/clip_fraction         | 0.14        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.428      |
|    agent/train/explained_variance    | 0.968       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00733    |
|    agent/train/n_updates             | 300         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 10.1         |
|    agent/rollout/ep_rew_wrapped_mean | 9.45         |
|    agent/time/fps                    | 4173         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0027510882 |
|    agent/train/clip_fraction         | 0.121        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.402       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.018       |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.7         |
|    agent/rollout/ep_rew_wrapped_mean | 7.47        |
|    agent/time/fps                    | 4236        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 67584       |
|    agent/train/approx_kl             | 0.003456255 |
|    agent/train/clip_fraction         | 0.148       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.406      |
|    agent/train/explained_variance    | 0.95        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00528     |
|    agent/train/n_updates             | 320         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.64         |
|    agent/rollout/ep_rew_wrapped_mean | 5.25         |
|    agent/time/fps                    | 3972         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 69632        |
|    agent/train/approx_kl             | 0.0030217858 |
|    agent/train/clip_fraction         | 0.112        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0109      |
|    agent/train/n_updates             | 330          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.3          |
|    agent/rollout/ep_rew_wrapped_mean | 2.91         |
|    agent/time/fps                    | 4014         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0029371749 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.372       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0104      |
|    agent/train/n_updates             | 340          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.3          |
|    agent/rollout/ep_rew_wrapped_mean | 0.127        |
|    agent/time/fps                    | 3750         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0025129158 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.34        |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00824      |
|    agent/train/n_updates             | 350          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 9.22         |
|    agent/rollout/ep_rew_wrapped_mean | -2.88        |
|    agent/time/fps                    | 4115         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0033093472 |
|    agent/train/clip_fraction         | 0.125        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.361       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0194      |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 9.12        |
|    agent/rollout/ep_rew_wrapped_mean | -5.17       |
|    agent/time/fps                    | 4195        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 77824       |
|    agent/train/approx_kl             | 0.003322028 |
|    agent/train/clip_fraction         | 0.119       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.321      |
|    agent/train/explained_variance    | 0.978       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0137     |
|    agent/train/n_updates             | 370         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.93         |
|    agent/rollout/ep_rew_wrapped_mean | -7.62        |
|    agent/time/fps                    | 4026         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0026963805 |
|    agent/train/clip_fraction         | 0.135        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.369       |
|    agent/train/explained_variance    | 0.975        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0165      |
|    agent/train/n_updates             | 380          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.88         |
|    agent/rollout/ep_rew_wrapped_mean | -10.2        |
|    agent/time/fps                    | 4018         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0033389889 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.345       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00605      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.88         |
|    agent/rollout/ep_rew_wrapped_mean | -13.2        |
|    agent/time/fps                    | 4012         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0050172717 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.352       |
|    agent/train/explained_variance    | 0.972        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00405     |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.57         |
|    agent/rollout/ep_rew_wrapped_mean | -15.9        |
|    agent/time/fps                    | 4067         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 86016        |
|    agent/train/approx_kl             | 0.0023862198 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.319       |
|    agent/train/explained_variance    | 0.96         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00782     |
|    agent/train/n_updates             | 410          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.33         |
|    agent/rollout/ep_rew_wrapped_mean | -18.3        |
|    agent/time/fps                    | 3996         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0028758252 |
|    agent/train/clip_fraction         | 0.101        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.33        |
|    agent/train/explained_variance    | 0.969        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000546    |
|    agent/train/n_updates             | 420          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.35         |
|    agent/rollout/ep_rew_wrapped_mean | -21          |
|    agent/time/fps                    | 3937         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0031007628 |
|    agent/train/clip_fraction         | 0.106        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.322       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.000102    |
|    agent/train/n_updates             | 430          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.34         |
|    agent/rollout/ep_rew_wrapped_mean | -23.9        |
|    agent/time/fps                    | 4127         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0035710195 |
|    agent/train/clip_fraction         | 0.119        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.311       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00731     |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.37         |
|    agent/rollout/ep_rew_wrapped_mean | -26.8        |
|    agent/time/fps                    | 4038         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0030310126 |
|    agent/train/clip_fraction         | 0.0859       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.299       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0148      |
|    agent/train/n_updates             | 450          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.41         |
|    agent/rollout/ep_rew_wrapped_mean | -29.5        |
|    agent/time/fps                    | 3922         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0019994369 |
|    agent/train/clip_fraction         | 0.0895       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.298       |
|    agent/train/explained_variance    | 0.97         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00158      |
|    agent/train/n_updates             | 460          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.43         |
|    agent/rollout/ep_rew_wrapped_mean | -31.7        |
|    agent/time/fps                    | 4086         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0016673091 |
|    agent/train/clip_fraction         | 0.0897       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.303       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0017      |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.38         |
|    agent/rollout/ep_rew_wrapped_mean | -33.6        |
|    agent/time/fps                    | 4118         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0022135084 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.313       |
|    agent/train/explained_variance    | 0.876        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.018        |
|    agent/train/n_updates             | 480          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.39         |
|    agent/rollout/ep_rew_wrapped_mean | -34.8        |
|    agent/time/fps                    | 4011         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 102400       |
|    agent/train/approx_kl             | 0.0040220674 |
|    agent/train/clip_fraction         | 0.113        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.305       |
|    agent/train/explained_variance    | 0.954        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00575     |
|    agent/train/n_updates             | 490          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.39         |
|    agent/rollout/ep_rew_wrapped_mean | -34.5        |
|    agent/time/fps                    | 4281         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 104448       |
|    agent/train/approx_kl             | 0.0038559996 |
|    agent/train/clip_fraction         | 0.124        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.325       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00757     |
|    agent/train/n_updates             | 500          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.39         |
|    agent/rollout/ep_rew_wrapped_mean | -34.3        |
|    agent/time/fps                    | 4159         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 106496       |
|    agent/train/approx_kl             | 0.0049486607 |
|    agent/train/clip_fraction         | 0.156        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.339       |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0553       |
|    agent/train/n_updates             | 510          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.4          |
|    agent/rollout/ep_rew_wrapped_mean | -33.3        |
|    agent/time/fps                    | 4011         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0048154034 |
|    agent/train/clip_fraction         | 0.127        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.35        |
|    agent/train/explained_variance    | 0.968        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00892      |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.38         |
|    agent/rollout/ep_rew_wrapped_mean | -33.1        |
|    agent/time/fps                    | 3911         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 110592       |
|    agent/train/approx_kl             | 0.0035564718 |
|    agent/train/clip_fraction         | 0.142        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.348       |
|    agent/train/explained_variance    | 0.971        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00151      |
|    agent/train/n_updates             | 530          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.36        |
|    agent/rollout/ep_rew_wrapped_mean | -32.3       |
|    agent/time/fps                    | 4182        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.004560022 |
|    agent/train/clip_fraction         | 0.152       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.369      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00815    |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.34         |
|    agent/rollout/ep_rew_wrapped_mean | -31.3        |
|    agent/time/fps                    | 3952         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 114688       |
|    agent/train/approx_kl             | 0.0036684608 |
|    agent/train/clip_fraction         | 0.107        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.343       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00659     |
|    agent/train/n_updates             | 550          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.34        |
|    agent/rollout/ep_rew_wrapped_mean | -29.8       |
|    agent/time/fps                    | 3821        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.004099927 |
|    agent/train/clip_fraction         | 0.135       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.342      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0112     |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.38         |
|    agent/rollout/ep_rew_wrapped_mean | -28          |
|    agent/time/fps                    | 3731         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0029494776 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.339       |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00227      |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.38        |
|    agent/rollout/ep_rew_wrapped_mean | -25.8       |
|    agent/time/fps                    | 4086        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.008213507 |
|    agent/train/clip_fraction         | 0.146       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.334      |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00745    |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 8.43        |
|    agent/rollout/ep_rew_wrapped_mean | -23.4       |
|    agent/time/fps                    | 3658        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.004345351 |
|    agent/train/clip_fraction         | 0.133       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.341      |
|    agent/train/explained_variance    | 0.979       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0179     |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 8.44         |
|    agent/rollout/ep_rew_wrapped_mean | -20.5        |
|    agent/time/fps                    | 3995         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0033539492 |
|    agent/train/clip_fraction         | 0.12         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.342       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00651      |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁▁▂▃▃▄▅▅▅▅▅▅▅▆▅▅▆▆▆▆▆▇▇██▇▆▅▅▆▆▆▆▆▆▇▇
time/fps,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▃▃▂▂▁▂▄▂▂▃▄▂▃▃▂▂▂▄▂▂▁▃▅▂█▄▃▃▁▂▃▂▁▂▂▃▅▃▃
train/clip_fraction,▂▆▃▁▃▂▇▂▄▆▆▄▄█▃▃▅▄▄▃▁▂▇▆▂▅▃▆▃▂▂▃▁▁▄▃▄▆▇▄
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▃▆▆▇▇▃▇▆▅▅▆▇▅▆▇▆▇▅▆▇▇▆▅▇▇▇▇█▇▆▇▇▇▆▇▇▆▄▅
train/explained_variance,▁███████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,189.21001
time/fps,2817.0
train/approx_kl,0.00701
train/clip_fraction,0.15771
train/clip_range,0.1
train/entropy_loss,-0.3873
train/explained_variance,0.99979
train/learning_rate,0.002


 80%|███████████████████████████████████▏        | 4/5 [25:07<06:16, 376.46s/it]

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011212541199185782, max=1.0…

Query schedule: [50, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5]
Collecting 100 fragments (20000 transitions)
Requested 12000 transitions but only 0 in buffer. Sampling 12000 additional transitions.
Sampling 8000 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 83 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 37.5     |
|    agent/rollout/ep_rew_wrapped_mean | 16.6     |
|    agent/time/fps                    | 4226     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_len_mean         | 500      |
|    agent/rollout/ep_rew_mean         | 37.5     |
|    agent/rollout/ep_rew_wrapped_mean | 16.6     |
|    agent/time/fps                    | 4.23e+03 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps 

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 33.1        |
|    agent/rollout/ep_rew_wrapped_mean | 3.93        |
|    agent/time/fps                    | 4266        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 4096        |
|    agent/train/approx_kl             | 0.003076303 |
|    agent/train/clip_fraction         | 0.137       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.691      |
|    agent/train/explained_variance    | 0.244       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00505    |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 31.4        |
|    agent/rollout/ep_rew_wrapped_mean | -4.38       |
|    agent/time/fps                    | 4395        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 6144        |
|    agent/train/approx_kl             | 0.002128439 |
|    agent/train/clip_fraction         | 0.0898      |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.688      |
|    agent/train/explained_variance    | -0.0648     |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0175      |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.1         |
|    agent/rollout/ep_rew_wrapped_mean | -9.65        |
|    agent/time/fps                    | 4030         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0024094093 |
|    agent/train/clip_fraction         | 0.141        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.68        |
|    agent/train/explained_variance    | 0.137        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0218      |
|    agent/train/n_updates             | 30           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.4         |
|    agent/rollout/ep_rew_wrapped_mean | -14.3        |
|    agent/time/fps                    | 3494         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0038145324 |
|    agent/train/clip_fraction         | 0.234        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.668       |
|    agent/train/explained_variance    | 0.39         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0129      |
|    agent/train/n_updates             | 40           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.2         |
|    agent/rollout/ep_rew_wrapped_mean | -18.3        |
|    agent/time/fps                    | 4357         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0046500815 |
|    agent/train/clip_fraction         | 0.237        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.653       |
|    agent/train/explained_variance    | 0.391        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00241     |
|    agent/train/n_updates             | 50           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.9         |
|    agent/rollout/ep_rew_wrapped_mean | -21.5        |
|    agent/time/fps                    | 4328         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 14336        |
|    agent/train/approx_kl             | 0.0037401614 |
|    agent/train/clip_fraction         | 0.253        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.637       |
|    agent/train/explained_variance    | 0.781        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0103      |
|    agent/train/n_updates             | 60           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 22          |
|    agent/rollout/ep_rew_wrapped_mean | -27.4       |
|    agent/time/fps                    | 4238        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 16384       |
|    agent/train/approx_kl             | 0.004863657 |
|    agent/train/clip_fraction         | 0.255       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.619      |
|    agent/train/explained_variance    | 0.716       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0187     |
|    agent/train/n_updates             | 70          |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.4         |
|    agent/rollout/ep_rew_wrapped_mean | -26.3        |
|    agent/time/fps                    | 3719         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0033644265 |
|    agent/train/clip_fraction         | 0.228        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.599       |
|    agent/train/explained_variance    | 0.812        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00629     |
|    agent/train/n_updates             | 80           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.2         |
|    agent/rollout/ep_rew_wrapped_mean | -34.7        |
|    agent/time/fps                    | 3741         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0048169475 |
|    agent/train/clip_fraction         | 0.188        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.57        |
|    agent/train/explained_variance    | 0.883        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0152      |
|    agent/train/n_updates             | 90           |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.3         |
|    agent/rollout/ep_rew_wrapped_mean | -39.1        |
|    agent/time/fps                    | 3575         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0062752934 |
|    agent/train/clip_fraction         | 0.271        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.565       |
|    agent/train/explained_variance    | 0.817        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0186      |
|    agent/train/n_updates             | 100          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 21.6         |
|    agent/rollout/ep_rew_wrapped_mean | -48.8        |
|    agent/time/fps                    | 3534         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 24576        |
|    agent/train/approx_kl             | 0.0060805073 |
|    agent/train/clip_fraction         | 0.277        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.553       |
|    agent/train/explained_variance    | 0.87         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00441     |
|    agent/train/n_updates             | 110          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 22.2         |
|    agent/rollout/ep_rew_wrapped_mean | -48          |
|    agent/time/fps                    | 1944         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0036729893 |
|    agent/train/clip_fraction         | 0.155        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.532       |
|    agent/train/explained_variance    | 0.112        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0166       |
|    agent/train/n_updates             | 120          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 23.3         |
|    agent/rollout/ep_rew_wrapped_mean | -46.6        |
|    agent/time/fps                    | 3251         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0036540646 |
|    agent/train/clip_fraction         | 0.207        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.53        |
|    agent/train/explained_variance    | 0.814        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0119      |
|    agent/train/n_updates             | 130          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24           |
|    agent/rollout/ep_rew_wrapped_mean | -45.9        |
|    agent/time/fps                    | 4165         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0031690495 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.569       |
|    agent/train/explained_variance    | 0.894        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00741     |
|    agent/train/n_updates             | 140          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 24.5         |
|    agent/rollout/ep_rew_wrapped_mean | -44.4        |
|    agent/time/fps                    | 3934         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0035449266 |
|    agent/train/clip_fraction         | 0.167        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.516       |
|    agent/train/explained_variance    | 0.909        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.011        |
|    agent/train/n_updates             | 150          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 24.6        |
|    agent/rollout/ep_rew_wrapped_mean | -42.2       |
|    agent/time/fps                    | 4101        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 34816       |
|    agent/train/approx_kl             | 0.004287687 |
|    agent/train/clip_fraction         | 0.118       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.474      |
|    agent/train/explained_variance    | 0.219       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000496   |
|    agent/train/n_updates             | 160         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 25.2         |
|    agent/rollout/ep_rew_wrapped_mean | -39.7        |
|    agent/time/fps                    | 4240         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 36864        |
|    agent/train/approx_kl             | 0.0036563894 |
|    agent/train/clip_fraction         | 0.105        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.441       |
|    agent/train/explained_variance    | 0.416        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.013       |
|    agent/train/n_updates             | 170          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 26.4        |
|    agent/rollout/ep_rew_wrapped_mean | -36.8       |
|    agent/time/fps                    | 3883        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 38912       |
|    agent/train/approx_kl             | 0.003681459 |
|    agent/train/clip_fraction         | 0.109       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.419      |
|    agent/train/explained_variance    | 0.761       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0178      |
|    agent/train/n_updates             | 180         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 27.1        |
|    agent/rollout/ep_rew_wrapped_mean | -34.4       |
|    agent/time/fps                    | 3559        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 40960       |
|    agent/train/approx_kl             | 0.002116756 |
|    agent/train/clip_fraction         | 0.119       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.422      |
|    agent/train/explained_variance    | 0.618       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00502     |
|    agent/train/n_updates             | 190         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 27.5         |
|    agent/rollout/ep_rew_wrapped_mean | -31.5        |
|    agent/time/fps                    | 3588         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 43008        |
|    agent/train/approx_kl             | 0.0025267377 |
|    agent/train/clip_fraction         | 0.0939       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.388       |
|    agent/train/explained_variance    | 0.841        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00288     |
|    agent/train/n_updates             | 200          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 28.1        |
|    agent/rollout/ep_rew_wrapped_mean | -28.4       |
|    agent/time/fps                    | 3553        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 45056       |
|    agent/train/approx_kl             | 0.003576827 |
|    agent/train/clip_fraction         | 0.108       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.401      |
|    agent/train/explained_variance    | 0.863       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0151     |
|    agent/train/n_updates             | 210         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 29.2         |
|    agent/rollout/ep_rew_wrapped_mean | -24.7        |
|    agent/time/fps                    | 3564         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0024634139 |
|    agent/train/clip_fraction         | 0.102        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.371       |
|    agent/train/explained_variance    | 0.881        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0135       |
|    agent/train/n_updates             | 220          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 29.7        |
|    agent/rollout/ep_rew_wrapped_mean | -21         |
|    agent/time/fps                    | 3603        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 49152       |
|    agent/train/approx_kl             | 0.004358015 |
|    agent/train/clip_fraction         | 0.133       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.345      |
|    agent/train/explained_variance    | 0.903       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0118     |
|    agent/train/n_updates             | 230         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 30.1        |
|    agent/rollout/ep_rew_wrapped_mean | -17.6       |
|    agent/time/fps                    | 3535        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 51200       |
|    agent/train/approx_kl             | 0.004012108 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.401      |
|    agent/train/explained_variance    | 0.876       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000861   |
|    agent/train/n_updates             | 240         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 32.4         |
|    agent/rollout/ep_rew_wrapped_mean | -14.5        |
|    agent/time/fps                    | 3782         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 53248        |
|    agent/train/approx_kl             | 0.0043165423 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.34        |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0102      |
|    agent/train/n_updates             | 250          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 33.8         |
|    agent/rollout/ep_rew_wrapped_mean | -11.1        |
|    agent/time/fps                    | 3760         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 55296        |
|    agent/train/approx_kl             | 0.0038744854 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.341       |
|    agent/train/explained_variance    | 0.922        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00681     |
|    agent/train/n_updates             | 260          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 35.3         |
|    agent/rollout/ep_rew_wrapped_mean | -8.25        |
|    agent/time/fps                    | 3481         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0050398996 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.387       |
|    agent/train/explained_variance    | 0.859        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00154     |
|    agent/train/n_updates             | 270          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 37.7         |
|    agent/rollout/ep_rew_wrapped_mean | -4.55        |
|    agent/time/fps                    | 3614         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0040767197 |
|    agent/train/clip_fraction         | 0.179        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.472       |
|    agent/train/explained_variance    | 0.886        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00136     |
|    agent/train/n_updates             | 280          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 40           |
|    agent/rollout/ep_rew_wrapped_mean | -1.72        |
|    agent/time/fps                    | 3437         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0035848392 |
|    agent/train/clip_fraction         | 0.134        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.44        |
|    agent/train/explained_variance    | 0.901        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0189       |
|    agent/train/n_updates             | 290          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 43.1         |
|    agent/rollout/ep_rew_wrapped_mean | -0.0132      |
|    agent/time/fps                    | 3748         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0035369191 |
|    agent/train/clip_fraction         | 0.15         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.466       |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00415     |
|    agent/train/n_updates             | 300          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 47.1         |
|    agent/rollout/ep_rew_wrapped_mean | 2.64         |
|    agent/time/fps                    | 3668         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 65536        |
|    agent/train/approx_kl             | 0.0026366916 |
|    agent/train/clip_fraction         | 0.131        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.395       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00878     |
|    agent/train/n_updates             | 310          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 52.9         |
|    agent/rollout/ep_rew_wrapped_mean | 4.56         |
|    agent/time/fps                    | 3449         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0024683103 |
|    agent/train/clip_fraction         | 0.117        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.338       |
|    agent/train/explained_variance    | 0.943        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00662      |
|    agent/train/n_updates             | 320          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 57          |
|    agent/rollout/ep_rew_wrapped_mean | 7.45        |
|    agent/time/fps                    | 3483        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.004172721 |
|    agent/train/clip_fraction         | 0.141       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.404      |
|    agent/train/explained_variance    | 0.925       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0196      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 61.4        |
|    agent/rollout/ep_rew_wrapped_mean | 9.49        |
|    agent/time/fps                    | 2560        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 71680       |
|    agent/train/approx_kl             | 0.002563006 |
|    agent/train/clip_fraction         | 0.111       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.387      |
|    agent/train/explained_variance    | 0.94        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0146      |
|    agent/train/n_updates             | 340         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 66          |
|    agent/rollout/ep_rew_wrapped_mean | 10.8        |
|    agent/time/fps                    | 3850        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 73728       |
|    agent/train/approx_kl             | 0.008709302 |
|    agent/train/clip_fraction         | 0.19        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.39       |
|    agent/train/explained_variance    | 0.935       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0108     |
|    agent/train/n_updates             | 350         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 73.2         |
|    agent/rollout/ep_rew_wrapped_mean | 11.4         |
|    agent/time/fps                    | 3297         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0032973287 |
|    agent/train/clip_fraction         | 0.137        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.459       |
|    agent/train/explained_variance    | 0.859        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.026        |
|    agent/train/n_updates             | 360          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 81.2         |
|    agent/rollout/ep_rew_wrapped_mean | 12.4         |
|    agent/time/fps                    | 3781         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0037742788 |
|    agent/train/clip_fraction         | 0.174        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.486       |
|    agent/train/explained_variance    | 0.974        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0258       |
|    agent/train/n_updates             | 370          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 91.9        |
|    agent/rollout/ep_rew_wrapped_mean | 14.9        |
|    agent/time/fps                    | 3482        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 79872       |
|    agent/train/approx_kl             | 0.004302296 |
|    agent/train/clip_fraction         | 0.189       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.49       |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00466    |
|    agent/train/n_updates             | 380         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 103          |
|    agent/rollout/ep_rew_wrapped_mean | 18.5         |
|    agent/time/fps                    | 3658         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 81920        |
|    agent/train/approx_kl             | 0.0055024056 |
|    agent/train/clip_fraction         | 0.189        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.511       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00785      |
|    agent/train/n_updates             | 390          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 112          |
|    agent/rollout/ep_rew_wrapped_mean | 23.1         |
|    agent/time/fps                    | 3703         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0030656795 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.53        |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0015      |
|    agent/train/n_updates             | 400          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 119         |
|    agent/rollout/ep_rew_wrapped_mean | 27          |
|    agent/time/fps                    | 3558        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.006690376 |
|    agent/train/clip_fraction         | 0.202       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.541      |
|    agent/train/explained_variance    | 0.97        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0347      |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 132         |
|    agent/rollout/ep_rew_wrapped_mean | 31.5        |
|    agent/time/fps                    | 3570        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 88064       |
|    agent/train/approx_kl             | 0.003206924 |
|    agent/train/clip_fraction         | 0.15        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.534      |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0233      |
|    agent/train/n_updates             | 420         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 145         |
|    agent/rollout/ep_rew_wrapped_mean | 38.7        |
|    agent/time/fps                    | 3464        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 90112       |
|    agent/train/approx_kl             | 0.003097809 |
|    agent/train/clip_fraction         | 0.131       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.546      |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0883      |
|    agent/train/n_updates             | 430         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 157          |
|    agent/rollout/ep_rew_wrapped_mean | 46.6         |
|    agent/time/fps                    | 3724         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 92160        |
|    agent/train/approx_kl             | 0.0032395942 |
|    agent/train/clip_fraction         | 0.162        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.54        |
|    agent/train/explained_variance    | 0.976        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0536       |
|    agent/train/n_updates             | 440          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 170         |
|    agent/rollout/ep_rew_wrapped_mean | 57          |
|    agent/time/fps                    | 3772        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 94208       |
|    agent/train/approx_kl             | 0.006663169 |
|    agent/train/clip_fraction         | 0.209       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.53       |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0149      |
|    agent/train/n_updates             | 450         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 187         |
|    agent/rollout/ep_rew_wrapped_mean | 67.6        |
|    agent/time/fps                    | 3563        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 96256       |
|    agent/train/approx_kl             | 0.003215547 |
|    agent/train/clip_fraction         | 0.168       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.527      |
|    agent/train/explained_variance    | 0.986       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0346      |
|    agent/train/n_updates             | 460         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 198          |
|    agent/rollout/ep_rew_wrapped_mean | 81.8         |
|    agent/time/fps                    | 3691         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 98304        |
|    agent/train/approx_kl             | 0.0062749507 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.986        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0333       |
|    agent/train/n_updates             | 470          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 216         |
|    agent/rollout/ep_rew_wrapped_mean | 92.2        |
|    agent/time/fps                    | 3786        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 100352      |
|    agent/train/approx_kl             | 0.003727619 |
|    agent/train/clip_fraction         | 0.164       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.511      |
|    agent/train/explained_variance    | 0.98        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0265      |
|    agent/train/n_updates             | 480         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 235         |
|    agent/rollout/ep_rew_wrapped_mean | 108         |
|    agent/time/fps                    | 3985        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 102400      |
|    agent/train/approx_kl             | 0.005136884 |
|    agent/train/clip_fraction         | 0.184       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.481      |
|    agent/train/explained_variance    | 0.973       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0173      |
|    agent/train/n_updates             | 490         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 251         |
|    agent/rollout/ep_rew_wrapped_mean | 127         |
|    agent/time/fps                    | 3580        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 104448      |
|    agent/train/approx_kl             | 0.006297645 |
|    agent/train/clip_fraction         | 0.213       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.472      |
|    agent/train/explained_variance    | 0.975       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0204      |
|    agent/train/n_updates             | 500         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 268         |
|    agent/rollout/ep_rew_wrapped_mean | 149         |
|    agent/time/fps                    | 3872        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.004104485 |
|    agent/train/clip_fraction         | 0.202       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.472      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0263      |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 286          |
|    agent/rollout/ep_rew_wrapped_mean | 172          |
|    agent/time/fps                    | 3751         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0069851046 |
|    agent/train/clip_fraction         | 0.239        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.461       |
|    agent/train/explained_variance    | 0.965        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0527       |
|    agent/train/n_updates             | 520          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 303         |
|    agent/rollout/ep_rew_wrapped_mean | 195         |
|    agent/time/fps                    | 3886        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.007094875 |
|    agent/train/clip_fraction         | 0.249       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.446      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.023       |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 320          |
|    agent/rollout/ep_rew_wrapped_mean | 212          |
|    agent/time/fps                    | 3920         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 112640       |
|    agent/train/approx_kl             | 0.0035189218 |
|    agent/train/clip_fraction         | 0.19         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.978        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00334     |
|    agent/train/n_updates             | 540          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 336         |
|    agent/rollout/ep_rew_wrapped_mean | 231         |
|    agent/time/fps                    | 3625        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 114688      |
|    agent/train/approx_kl             | 0.002618753 |
|    agent/train/clip_fraction         | 0.144       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.436      |
|    agent/train/explained_variance    | 0.988       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0125      |
|    agent/train/n_updates             | 550         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 351         |
|    agent/rollout/ep_rew_wrapped_mean | 248         |
|    agent/time/fps                    | 4094        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 116736      |
|    agent/train/approx_kl             | 0.004130139 |
|    agent/train/clip_fraction         | 0.171       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.441      |
|    agent/train/explained_variance    | 0.987       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0111      |
|    agent/train/n_updates             | 560         |
|    agent/train/policy_gradient_

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 365          |
|    agent/rollout/ep_rew_wrapped_mean | 265          |
|    agent/time/fps                    | 3783         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0021720035 |
|    agent/train/clip_fraction         | 0.126        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.432       |
|    agent/train/explained_variance    | 0.977        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -6.36e-05    |
|    agent/train/n_updates             | 570          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 380          |
|    agent/rollout/ep_rew_wrapped_mean | 285          |
|    agent/time/fps                    | 3943         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 120832       |
|    agent/train/approx_kl             | 0.0031098356 |
|    agent/train/clip_fraction         | 0.109        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.416       |
|    agent/train/explained_variance    | 0.838        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0231       |
|    agent/train/n_updates             | 580          |
|    agent/train

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 103 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 500         |
|    agent/rollout/ep_rew_mean         | 395         |
|    agent/rollout/ep_rew_wrapped_mean | 303         |
|    agent/time/fps                    | 3659        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.004482505 |
|    agent/train/clip_fraction         | 0.141       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.422      |
|    agent/train/explained_variance    | 0.933       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00781     |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient

Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 83 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 500          |
|    agent/rollout/ep_rew_mean         | 409          |
|    agent/rollout/ep_rew_wrapped_mean | 319          |
|    agent/time/fps                    | 3736         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0025538523 |
|    agent/train/clip_fraction         | 0.143        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.42        |
|    agent/train/explained_variance    | 0.966        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0336       |
|    agent/train/n_updates             | 600          |
|    agent/train



VBox(children=(Label(value='0.110 MB of 0.110 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▁▁▂▂▄▅▆▇███████████████████████████████
time/fps,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▃▂▁▂▂▅▂▂▄▂▂▂▇▄▂▅▃▃▃▃▃▂▂▂▃▂▅▃▃█▂▂▂▄▅▅▂▂▂
train/clip_fraction,▁▇▃▂▅▇█▂▂▃▂▂▂▃▂▂▂▂▂▂▂▂▂▂▁▂▁▃▁▂▁▂▁▁▁▂▁▂▁▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▅▂▂▃▃▅▆▆▇▇▇▇▇██▇██████████████████████
train/explained_variance,▁▇▇█████████████████████████████████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,501760.0
rollout/ep_len_mean,500.0
rollout/ep_rew_mean,500.0
time/fps,2826.0
train/approx_kl,0.00356
train/clip_fraction,0.08794
train/clip_range,0.1
train/entropy_loss,-0.13681
train/explained_variance,0.99977
train/learning_rate,0.002


100%|████████████████████████████████████████████| 5/5 [31:46<00:00, 381.23s/it]


In [5]:
import pandas as pd

# `results` is produced by an earlier cell. Building the frame with the two
# statistic names as row labels and then transposing yields one row per entry
# of `results`, with "Mean Reward" and "Standard Error" as the columns.
# (assumes `results` is a mapping keyed by number of comparisons -- confirm
# against the cell that populates it)
results_df = pd.DataFrame(
    results,
    index=["Mean Reward", "Standard Error"],
).transpose()

# Write the summary table to CSV (path is relative to the working directory).
results_df.to_csv("cartpole_preference_improved100.csv")


In [6]:
# Display the summary table: one row per entry of `results`, with the
# "Mean Reward" and "Standard Error" columns.
results_df

Unnamed: 0,Mean Reward,Standard Error
10,106.144,85.203645
25,13.024,2.674473
50,201.362,97.819847
100,210.562,105.654535
200,160.81,79.040042
300,432.054,60.77275
500,450.048,44.678427


In [None]:
import matplotlib.pyplot as plt

# Cast the index to integers so it can be used directly as the x values.
# Note that this reassigns the index on `results_df` itself.
results_df.index = results_df.index.astype(int)

# One figure with a single axes: mean reward against the index, drawn as a
# connected line with point markers and red, capped error bars taken from the
# "Standard Error" column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(
    results_df.index,
    results_df['Mean Reward'],
    yerr=results_df['Standard Error'],
    fmt='-o',
    ecolor='r',
    capsize=5,
    label='Mean Reward',
)

# Axis labels, title, legend and grid so the figure stands on its own.
ax.set_xlabel('Number of Comparisons')
ax.set_ylabel('Mean Reward')
ax.set_title('Mean Reward with Standard Error for Different Numbers of Comparisons')
ax.legend()
ax.grid(True)
plt.show()
