In [1]:
import random
from imitation.algorithms import preference_comparisons
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
import gymnasium as gym
from stable_baselines3 import PPO
import numpy as np
from typing import Sequence
import torch as th
import numpy as np

from imitation.data.types import (
    AnyPath,
    TrajectoryPair,
    TrajectoryWithRew,
    TrajectoryWithRewPair,
    Transitions,
)

from imitation.data import rollout



  from pandas.core import (


In [2]:
import seals

In [40]:
class CrossEntropyRewardLossCustom(preference_comparisons.RewardLoss):
    """Compute the cross entropy reward loss."""

    def __init__(self) -> None:
        """Create cross entropy reward loss."""
        super().__init__()

    def forward(
        self,
        fragment_pairs: Sequence[TrajectoryPair],
        preferences: np.ndarray,
        preference_model: preference_comparisons.PreferenceModel,
    ) -> preference_comparisons.LossAndMetrics:
        """Computes the loss.

        Args:
            fragment_pairs: Batch consisting of pairs of trajectory fragments.
            preferences: The probability that the first fragment is preferred
                over the second. Typically 0, 1 or 0.5 (tie).
            preference_model: model to predict the preferred fragment from a pair.

        Returns:
            The cross-entropy loss between the probability predicted by the
                reward model and the target probabilities in `preferences`. Metrics
                are accuracy, and gt_reward_loss, if the ground truth reward is
                available.
        """
        rews_pred = th.empty(2 * len(fragment_pairs), dtype=th.float32)
        rews_true = th.empty(2 * len(fragment_pairs), dtype=th.float32)

        cnt1 = 0
        cnt2 = 0
        
        for fragment in fragment_pairs:
            frag1, frag2 = fragment
            trans1 = rollout.flatten_trajectories([frag1])
            trans2 = rollout.flatten_trajectories([frag2])
            rews_pred[cnt1] = preference_model.rewards(trans1).sum()
            cnt1 += 1
            rews_pred[cnt1] = preference_model.rewards(trans2).sum()
            cnt1 += 1

            rews_true[cnt2] = th.tensor(frag1.rews.sum(), dtype=th.float32)
            cnt2 += 1
            rews_true[cnt2] = th.tensor(frag2.rews.sum(), dtype=th.float32)
            cnt2 += 1


        loss = th.nn.MSELoss()(rews_pred, rews_true)
        
        metrics = {}
        metrics["accuracy"] = th.tensor(0)
        metrics = {key: value.detach().cpu() for key, value in metrics.items()}
        return preference_comparisons.LossAndMetrics(
            loss=loss,
            metrics=metrics,
        )

In [44]:
rng = np.random.default_rng(0)

venv = make_vec_env("seals/CartPole-v0", rng=rng)

reward_net = BasicRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)

fragmenter = preference_comparisons.RandomFragmenter(
    warning_threshold=0,
    rng=rng,
)
gatherer = preference_comparisons.SyntheticGatherer(rng=rng)
preference_model = preference_comparisons.PreferenceModel(reward_net)
reward_trainer = preference_comparisons.BasicRewardTrainer(
    preference_model=preference_model,
    loss=CrossEntropyRewardLossCustom(),
    epochs=3,
    lr=0.001,
    rng=rng,
)


# Several hyperparameters (reward_epochs, ppo_clip_range, ppo_ent_coef,
# ppo_gae_lambda, ppo_n_epochs, discount_factor, use_sde, sde_sample_freq,
# ppo_lr, exploration_frac, num_iterations, initial_comparison_frac,
# initial_epoch_multiplier, query_schedule) used in this example have been
# approximately fine-tuned to reach a reasonable level of performance.
agent = PPO(
    policy=FeedForward32Policy,
    policy_kwargs=dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    ),
    env=venv,
    seed=0,
    n_steps=2048 // venv.num_envs,
    batch_size=64,
    ent_coef=0.01,
    learning_rate=2e-3,
    clip_range=0.1,
    gae_lambda=0.95,
    gamma=0.97,
    n_epochs=10,
)

trajectory_generator = preference_comparisons.AgentTrainer(
    algorithm=agent,
    reward_fn=reward_net,
    venv=venv,
    exploration_frac=0.05,
    rng=rng,
)

pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    num_iterations=60,  # Set to 60 for better performance
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    fragment_length=100,
    transition_oversampling=1,
    initial_comparison_frac=0.1,
    allow_variable_horizon=False,
    initial_epoch_multiplier=4,
    query_schedule="hyperbolic",
)

pref_comparisons.train(
    total_timesteps=50_000,
    total_comparisons=300,
)

Query schedule: [30, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Collecting 60 fragments (6000 transitions)
Requested 5700 transitions but only 0 in buffer. Sampling 5700 additional transitions.
Sampling 300 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 30 comparisons


Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 833 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_rew_wrapped_mean | -296     |
|    agent/time/fps                    | 19750    |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_rew_wrapped_mean | -296     |
|    agent/time/fps                    | 1.98e+04 |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2.05e+03 |
|    agent/train/approx_kl             | 0.00201  |
|    agent/train/clip_fraction         | 0.0806   |
|    agent/train/clip_range            | 0.1      |
|    agent/train/entropy_loss  

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -166         |
|    agent/time/fps                    | 22307        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0020143995 |
|    agent/train/clip_fraction         | 0.0806       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.691       |
|    agent/train/explained_variance    | -1.87        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0105       |
|    agent/train/n_updates             | 10           |
|    agent/train/policy_gradient_loss  | -0.00364     |
|    agent/train/value_loss            | 0.0451       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -96.9        |
|    agent/time/fps                    | 21725        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0030591032 |
|    agent/train/clip_fraction         | 0.103        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.689       |
|    agent/train/explained_variance    | 0.643        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00308      |
|    agent/train/n_updates             | 20           |
|    agent/train/policy_gradient_loss  | -0.00391     |
|    agent/train/value_loss            | 0.0534       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -57.2        |
|    agent/time/fps                    | 23597        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0012439466 |
|    agent/train/clip_fraction         | 0.0708       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.686       |
|    agent/train/explained_variance    | 0.771        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0108       |
|    agent/train/n_updates             | 30           |
|    agent/train/policy_gradient_loss  | -0.00156     |
|    agent/train/value_loss            | 0.0543       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | -36.3       |
|    agent/time/fps                    | 22814       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.002770629 |
|    agent/train/clip_fraction         | 0.149       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.673      |
|    agent/train/explained_variance    | 0.631       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0093     |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_loss  | -0.00663    |
|    agent/train/value_loss            | 0.0248      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_rew_wrapped_mean | -15.9      |
|    agent/time/fps                    | 22582      |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 12288      |
|    agent/train/approx_kl             | 0.00370212 |
|    agent/train/clip_fraction         | 0.196      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.673     |
|    agent/train/explained_variance    | 0.927      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00874    |
|    agent/train/n_updates             | 50         |
|    agent/train/policy_gradient_loss  | -0.0112    |
|    agent/train/value_loss            | 0.0158     |
-------------------------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 33.3        |
|    agent/time/fps                    | 19802       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 14336       |
|    agent/train/approx_kl             | 0.004168664 |
|    agent/train/clip_fraction         | 0.167       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.66       |
|    agent/train/explained_variance    | 0.801       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00633    |
|    agent/train/n_updates             | 60          |
|    agent/train/policy_gradient_loss  | -0.00592    |
|    agent/train/value_loss            | 0.0123      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 44.5         |
|    agent/time/fps                    | 21248        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 16384        |
|    agent/train/approx_kl             | 0.0041376296 |
|    agent/train/clip_fraction         | 0.192        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.641       |
|    agent/train/explained_variance    | 0.747        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00513      |
|    agent/train/n_updates             | 70           |
|    agent/train/policy_gradient_loss  | -0.0083      |
|    agent/train/value_loss            | 0.00936      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 36.3         |
|    agent/time/fps                    | 23582        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 18432        |
|    agent/train/approx_kl             | 0.0041196425 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.651       |
|    agent/train/explained_variance    | 0.545        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.017       |
|    agent/train/n_updates             | 80           |
|    agent/train/policy_gradient_loss  | -0.00814     |
|    agent/train/value_loss            | 0.00493      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 27.2         |
|    agent/time/fps                    | 21815        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 20480        |
|    agent/train/approx_kl             | 0.0061916243 |
|    agent/train/clip_fraction         | 0.332        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.63        |
|    agent/train/explained_variance    | 0.842        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0415      |
|    agent/train/n_updates             | 90           |
|    agent/train/policy_gradient_loss  | -0.0159      |
|    agent/train/value_loss            | 0.00343      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 20.8        |
|    agent/time/fps                    | 21332       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.006640197 |
|    agent/train/clip_fraction         | 0.271       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.613      |
|    agent/train/explained_variance    | 0.913       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0115     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradient_loss  | -0.0113     |
|    agent/train/value_loss            | 0.00357     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 17.2        |
|    agent/time/fps                    | 4921        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 24576       |
|    agent/train/approx_kl             | 0.007341261 |
|    agent/train/clip_fraction         | 0.238       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.588      |
|    agent/train/explained_variance    | 0.927       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00948     |
|    agent/train/n_updates             | 110         |
|    agent/train/policy_gradient_loss  | -0.00806    |
|    agent/train/value_loss            | 0.00464     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 16.7         |
|    agent/time/fps                    | 21233        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 26624        |
|    agent/train/approx_kl             | 0.0044561257 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.539       |
|    agent/train/explained_variance    | 0.946        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0113      |
|    agent/train/n_updates             | 120          |
|    agent/train/policy_gradient_loss  | -0.0047      |
|    agent/train/value_loss            | 0.0039       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 18.1         |
|    agent/time/fps                    | 23922        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 28672        |
|    agent/train/approx_kl             | 0.0039471877 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.528       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0218      |
|    agent/train/n_updates             | 130          |
|    agent/train/policy_gradient_loss  | -0.00952     |
|    agent/train/value_loss            | 0.00674      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.2         |
|    agent/time/fps                    | 24150        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 30720        |
|    agent/train/approx_kl             | 0.0033466732 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.49        |
|    agent/train/explained_variance    | 0.944        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0181      |
|    agent/train/n_updates             | 140          |
|    agent/train/policy_gradient_loss  | -0.00522     |
|    agent/train/value_loss            | 0.0104       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.3         |
|    agent/time/fps                    | 23708        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0035190007 |
|    agent/train/clip_fraction         | 0.164        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.444       |
|    agent/train/explained_variance    | 0.929        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00723     |
|    agent/train/n_updates             | 150          |
|    agent/train/policy_gradient_loss  | -0.00695     |
|    agent/train/value_loss            | 0.00624      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.3         |
|    agent/time/fps                    | 24080        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 34816        |
|    agent/train/approx_kl             | 0.0046203807 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.428       |
|    agent/train/explained_variance    | 0.906        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00906     |
|    agent/train/n_updates             | 160          |
|    agent/train/policy_gradient_loss  | -0.00532     |
|    agent/train/value_loss            | 0.00396      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 20.4        |
|    agent/time/fps                    | 22085       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 36864       |
|    agent/train/approx_kl             | 0.003545674 |
|    agent/train/clip_fraction         | 0.147       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.423      |
|    agent/train/explained_variance    | 0.931       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0114      |
|    agent/train/n_updates             | 170         |
|    agent/train/policy_gradient_loss  | -0.00379    |
|    agent/train/value_loss            | 0.00447     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 20.5         |
|    agent/time/fps                    | 23533        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 38912        |
|    agent/train/approx_kl             | 0.0042150137 |
|    agent/train/clip_fraction         | 0.175        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.409       |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0166      |
|    agent/train/n_updates             | 180          |
|    agent/train/policy_gradient_loss  | -0.00635     |
|    agent/train/value_loss            | 0.00543      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 20.1         |
|    agent/time/fps                    | 23480        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 40960        |
|    agent/train/approx_kl             | 0.0063265287 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.015       |
|    agent/train/n_updates             | 190          |
|    agent/train/policy_gradient_loss  | -0.00803     |
|    agent/train/value_loss            | 0.00553      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 19.3        |
|    agent/time/fps                    | 22851       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.003530215 |
|    agent/train/clip_fraction         | 0.207       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.437      |
|    agent/train/explained_variance    | 0.959       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0206     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradient_loss  | -0.0082     |
|    agent/train/value_loss            | 0.00487     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.2         |
|    agent/time/fps                    | 22142        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 45056        |
|    agent/train/approx_kl             | 0.0062283296 |
|    agent/train/clip_fraction         | 0.239        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.442       |
|    agent/train/explained_variance    | 0.962        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0199      |
|    agent/train/n_updates             | 210          |
|    agent/train/policy_gradient_loss  | -0.00992     |
|    agent/train/value_loss            | 0.00548      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.7         |
|    agent/time/fps                    | 21255        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 47104        |
|    agent/train/approx_kl             | 0.0038637496 |
|    agent/train/clip_fraction         | 0.223        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.433       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00254      |
|    agent/train/n_updates             | 220          |
|    agent/train/policy_gradient_loss  | -0.00728     |
|    agent/train/value_loss            | 0.00519      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.9         |
|    agent/time/fps                    | 21302        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 49152        |
|    agent/train/approx_kl             | 0.0045190607 |
|    agent/train/clip_fraction         | 0.221        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.429       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0265      |
|    agent/train/n_updates             | 230          |
|    agent/train/policy_gradient_loss  | -0.00886     |
|    agent/train/value_loss            | 0.00513      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 20.5         |
|    agent/time/fps                    | 15175        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 51200        |
|    agent/train/approx_kl             | 0.0025752438 |
|    agent/train/clip_fraction         | 0.158        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.399       |
|    agent/train/explained_variance    | 0.926        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.012       |
|    agent/train/n_updates             | 240          |
|    agent/train/policy_gradient_loss  | -0.00433     |
|    agent/train/value_loss            | 0.00398      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 21.1        |
|    agent/time/fps                    | 23316       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.004750454 |
|    agent/train/clip_fraction         | 0.19        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.412      |
|    agent/train/explained_variance    | 0.933       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00914    |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradient_loss  | -0.0053     |
|    agent/train/value_loss            | 0.00367     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 21.6        |
|    agent/time/fps                    | 13227       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 55296       |
|    agent/train/approx_kl             | 0.001510687 |
|    agent/train/clip_fraction         | 0.105       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.425      |
|    agent/train/explained_variance    | 0.949       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0157     |
|    agent/train/n_updates             | 260         |
|    agent/train/policy_gradient_loss  | -0.00158    |
|    agent/train/value_loss            | 0.00348     |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 21.6         |
|    agent/time/fps                    | 22597        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 57344        |
|    agent/train/approx_kl             | 0.0023433478 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.413       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0111      |
|    agent/train/n_updates             | 270          |
|    agent/train/policy_gradient_loss  | -0.0019      |
|    agent/train/value_loss            | 0.00647      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 21.3         |
|    agent/time/fps                    | 23569        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 59392        |
|    agent/train/approx_kl             | 0.0023488472 |
|    agent/train/clip_fraction         | 0.11         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.413       |
|    agent/train/explained_variance    | 0.958        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00304      |
|    agent/train/n_updates             | 280          |
|    agent/train/policy_gradient_loss  | -0.00125     |
|    agent/train/value_loss            | 0.00457      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 20.6         |
|    agent/time/fps                    | 23318        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 61440        |
|    agent/train/approx_kl             | 0.0034422912 |
|    agent/train/clip_fraction         | 0.166        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.422       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.017       |
|    agent/train/n_updates             | 290          |
|    agent/train/policy_gradient_loss  | -0.00352     |
|    agent/train/value_loss            | 0.00722      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.8         |
|    agent/time/fps                    | 22849        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 63488        |
|    agent/train/approx_kl             | 0.0051054745 |
|    agent/train/clip_fraction         | 0.173        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.453       |
|    agent/train/explained_variance    | 0.869        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0154      |
|    agent/train/n_updates             | 300          |
|    agent/train/policy_gradient_loss  | -0.00284     |
|    agent/train/value_loss            | 0.0115       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_rew_wrapped_mean | 19.1       |
|    agent/time/fps                    | 19150      |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 65536      |
|    agent/train/approx_kl             | 0.00293953 |
|    agent/train/clip_fraction         | 0.154      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.486     |
|    agent/train/explained_variance    | 0.775      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00445    |
|    agent/train/n_updates             | 310        |
|    agent/train/policy_gradient_loss  | -0.00257   |
|    agent/train/value_loss            | 0.0119     |
-------------------------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 18.6         |
|    agent/time/fps                    | 21628        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 67584        |
|    agent/train/approx_kl             | 0.0043083206 |
|    agent/train/clip_fraction         | 0.181        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.487       |
|    agent/train/explained_variance    | 0.917        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0348       |
|    agent/train/n_updates             | 320          |
|    agent/train/policy_gradient_loss  | -0.00304     |
|    agent/train/value_loss            | 0.00707      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 17.5        |
|    agent/time/fps                    | 21696       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 69632       |
|    agent/train/approx_kl             | 0.004002239 |
|    agent/train/clip_fraction         | 0.185       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.517      |
|    agent/train/explained_variance    | 0.871       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0175      |
|    agent/train/n_updates             | 330         |
|    agent/train/policy_gradient_loss  | -0.00456    |
|    agent/train/value_loss            | 0.011       |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 16.6         |
|    agent/time/fps                    | 23871        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 71680        |
|    agent/train/approx_kl             | 0.0043434156 |
|    agent/train/clip_fraction         | 0.203        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.523       |
|    agent/train/explained_variance    | 0.927        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0159      |
|    agent/train/n_updates             | 340          |
|    agent/train/policy_gradient_loss  | -0.00812     |
|    agent/train/value_loss            | 0.00767      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 15.8         |
|    agent/time/fps                    | 24368        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 73728        |
|    agent/train/approx_kl             | 0.0048871664 |
|    agent/train/clip_fraction         | 0.25         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.528       |
|    agent/train/explained_variance    | 0.95         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0112      |
|    agent/train/n_updates             | 350          |
|    agent/train/policy_gradient_loss  | -0.0109      |
|    agent/train/value_loss            | 0.0112       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 15.5         |
|    agent/time/fps                    | 24000        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 75776        |
|    agent/train/approx_kl             | 0.0040233885 |
|    agent/train/clip_fraction         | 0.242        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.579       |
|    agent/train/explained_variance    | 0.901        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0195      |
|    agent/train/n_updates             | 360          |
|    agent/train/policy_gradient_loss  | -0.00735     |
|    agent/train/value_loss            | 0.0154       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 15.1         |
|    agent/time/fps                    | 23790        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 77824        |
|    agent/train/approx_kl             | 0.0047922824 |
|    agent/train/clip_fraction         | 0.274        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.574       |
|    agent/train/explained_variance    | 0.947        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00545      |
|    agent/train/n_updates             | 370          |
|    agent/train/policy_gradient_loss  | -0.0112      |
|    agent/train/value_loss            | 0.0109       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 14.9         |
|    agent/time/fps                    | 23738        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 79872        |
|    agent/train/approx_kl             | 0.0037874058 |
|    agent/train/clip_fraction         | 0.254        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.583       |
|    agent/train/explained_variance    | 0.921        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00782     |
|    agent/train/n_updates             | 380          |
|    agent/train/policy_gradient_loss  | -0.00708     |
|    agent/train/value_loss            | 0.0149       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 14.9        |
|    agent/time/fps                    | 23950       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 81920       |
|    agent/train/approx_kl             | 0.004605065 |
|    agent/train/clip_fraction         | 0.28        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.581      |
|    agent/train/explained_variance    | 0.953       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0371     |
|    agent/train/n_updates             | 390         |
|    agent/train/policy_gradient_loss  | -0.00931    |
|    agent/train/value_loss            | 0.0133      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 15.6         |
|    agent/time/fps                    | 23353        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 83968        |
|    agent/train/approx_kl             | 0.0040751603 |
|    agent/train/clip_fraction         | 0.241        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.575       |
|    agent/train/explained_variance    | 0.948        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.014        |
|    agent/train/n_updates             | 400          |
|    agent/train/policy_gradient_loss  | -0.00966     |
|    agent/train/value_loss            | 0.0116       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 16          |
|    agent/time/fps                    | 24061       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 86016       |
|    agent/train/approx_kl             | 0.005794633 |
|    agent/train/clip_fraction         | 0.288       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.553      |
|    agent/train/explained_variance    | 0.941       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0232     |
|    agent/train/n_updates             | 410         |
|    agent/train/policy_gradient_loss  | -0.0104     |
|    agent/train/value_loss            | 0.0124      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 16.4         |
|    agent/time/fps                    | 23987        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 88064        |
|    agent/train/approx_kl             | 0.0062578293 |
|    agent/train/clip_fraction         | 0.269        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.542       |
|    agent/train/explained_variance    | 0.941        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0134      |
|    agent/train/n_updates             | 420          |
|    agent/train/policy_gradient_loss  | -0.00923     |
|    agent/train/value_loss            | 0.0131       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 16.9         |
|    agent/time/fps                    | 24021        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 90112        |
|    agent/train/approx_kl             | 0.0071030343 |
|    agent/train/clip_fraction         | 0.26         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.496       |
|    agent/train/explained_variance    | 0.952        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0155      |
|    agent/train/n_updates             | 430          |
|    agent/train/policy_gradient_loss  | -0.0123      |
|    agent/train/value_loss            | 0.0135       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 17.6        |
|    agent/time/fps                    | 22699       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 92160       |
|    agent/train/approx_kl             | 0.007844508 |
|    agent/train/clip_fraction         | 0.266       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.478      |
|    agent/train/explained_variance    | 0.932       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0044     |
|    agent/train/n_updates             | 440         |
|    agent/train/policy_gradient_loss  | -0.00865    |
|    agent/train/value_loss            | 0.0113      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 18.8         |
|    agent/time/fps                    | 23931        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 94208        |
|    agent/train/approx_kl             | 0.0071424074 |
|    agent/train/clip_fraction         | 0.212        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.412       |
|    agent/train/explained_variance    | 0.961        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0152      |
|    agent/train/n_updates             | 450          |
|    agent/train/policy_gradient_loss  | -0.0107      |
|    agent/train/value_loss            | 0.00878      |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 19.4         |
|    agent/time/fps                    | 22843        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 96256        |
|    agent/train/approx_kl             | 0.0053970115 |
|    agent/train/clip_fraction         | 0.16         |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.392       |
|    agent/train/explained_variance    | 0.951        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0214       |
|    agent/train/n_updates             | 460          |
|    agent/train/policy_gradient_loss  | -0.00489     |
|    agent/train/value_loss            | 0.0113       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 19.8        |
|    agent/time/fps                    | 22391       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 98304       |
|    agent/train/approx_kl             | 0.008454105 |
|    agent/train/clip_fraction         | 0.208       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.397      |
|    agent/train/explained_variance    | 0.958       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.00751     |
|    agent/train/n_updates             | 470         |
|    agent/train/policy_gradient_loss  | -0.0075     |
|    agent/train/value_loss            | 0.0104      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 20.4         |
|    agent/time/fps                    | 22988        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 100352       |
|    agent/train/approx_kl             | 0.0070155305 |
|    agent/train/clip_fraction         | 0.265        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.425       |
|    agent/train/explained_variance    | 0.953        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0173      |
|    agent/train/n_updates             | 480          |
|    agent/train/policy_gradient_loss  | -0.0118      |
|    agent/train/value_loss            | 0.0109       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_rew_wrapped_mean | 20.4       |
|    agent/time/fps                    | 23357      |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 102400     |
|    agent/train/approx_kl             | 0.01093177 |
|    agent/train/clip_fraction         | 0.28       |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.441     |
|    agent/train/explained_variance    | 0.939      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.015     |
|    agent/train/n_updates             | 490        |
|    agent/train/policy_gradient_loss  | -0.011     |
|    agent/train/value_loss            | 0.0143     |
-------------------------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 21.4        |
|    agent/time/fps                    | 22420       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 104448      |
|    agent/train/approx_kl             | 0.005570894 |
|    agent/train/clip_fraction         | 0.255       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.479      |
|    agent/train/explained_variance    | 0.96        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0101     |
|    agent/train/n_updates             | 500         |
|    agent/train/policy_gradient_loss  | -0.0102     |
|    agent/train/value_loss            | 0.0121      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 21.6        |
|    agent/time/fps                    | 23449       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 106496      |
|    agent/train/approx_kl             | 0.005423213 |
|    agent/train/clip_fraction         | 0.238       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.487      |
|    agent/train/explained_variance    | 0.92        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00977    |
|    agent/train/n_updates             | 510         |
|    agent/train/policy_gradient_loss  | -0.00947    |
|    agent/train/value_loss            | 0.0151      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 21.5         |
|    agent/time/fps                    | 23412        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 108544       |
|    agent/train/approx_kl             | 0.0069611417 |
|    agent/train/clip_fraction         | 0.256        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.517       |
|    agent/train/explained_variance    | 0.939        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.000666     |
|    agent/train/n_updates             | 520          |
|    agent/train/policy_gradient_loss  | -0.0103      |
|    agent/train/value_loss            | 0.0169       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 21.9        |
|    agent/time/fps                    | 21762       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 110592      |
|    agent/train/approx_kl             | 0.005987121 |
|    agent/train/clip_fraction         | 0.281       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.543      |
|    agent/train/explained_variance    | 0.929       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.028       |
|    agent/train/n_updates             | 530         |
|    agent/train/policy_gradient_loss  | -0.0102     |
|    agent/train/value_loss            | 0.0137      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 22.6        |
|    agent/time/fps                    | 24050       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 112640      |
|    agent/train/approx_kl             | 0.005923879 |
|    agent/train/clip_fraction         | 0.307       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.556      |
|    agent/train/explained_variance    | 0.931       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0158     |
|    agent/train/n_updates             | 540         |
|    agent/train/policy_gradient_loss  | -0.0113     |
|    agent/train/value_loss            | 0.0155      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_rew_wrapped_mean | 24.1       |
|    agent/time/fps                    | 20067      |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 114688     |
|    agent/train/approx_kl             | 0.00579118 |
|    agent/train/clip_fraction         | 0.326      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.568     |
|    agent/train/explained_variance    | 0.941      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.000797   |
|    agent/train/n_updates             | 550        |
|    agent/train/policy_gradient_loss  | -0.0115    |
|    agent/train/value_loss            | 0.016      |
-------------------------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_rew_wrapped_mean | 25.6       |
|    agent/time/fps                    | 17226      |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 116736     |
|    agent/train/approx_kl             | 0.00924789 |
|    agent/train/clip_fraction         | 0.313      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -0.555     |
|    agent/train/explained_variance    | 0.793      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.00253   |
|    agent/train/n_updates             | 560        |
|    agent/train/policy_gradient_loss  | -0.00525   |
|    agent/train/value_loss            | 0.0225     |
-------------------------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 27.4         |
|    agent/time/fps                    | 19973        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 118784       |
|    agent/train/approx_kl             | 0.0069552762 |
|    agent/train/clip_fraction         | 0.249        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.539       |
|    agent/train/explained_variance    | 0.891        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00166     |
|    agent/train/n_updates             | 570          |
|    agent/train/policy_gradient_loss  | -0.00634     |
|    agent/train/value_loss            | 0.0155       |
---------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 29.8        |
|    agent/time/fps                    | 21143       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 120832      |
|    agent/train/approx_kl             | 0.005485727 |
|    agent/train/clip_fraction         | 0.22        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.538      |
|    agent/train/explained_variance    | 0.92        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0142     |
|    agent/train/n_updates             | 580         |
|    agent/train/policy_gradient_loss  | -0.00535    |
|    agent/train/value_loss            | 0.0169      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 853 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 31.4        |
|    agent/time/fps                    | 22207       |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 122880      |
|    agent/train/approx_kl             | 0.004942756 |
|    agent/train/clip_fraction         | 0.196       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -0.547      |
|    agent/train/explained_variance    | 0.938       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.000154   |
|    agent/train/n_updates             | 590         |
|    agent/train/policy_gradient_loss  | -0.00223    |
|    agent/train/value_loss            | 0.0187      |
--------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 833 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | 33.5         |
|    agent/time/fps                    | 23061        |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 124928       |
|    agent/train/approx_kl             | 0.0062115993 |
|    agent/train/clip_fraction         | 0.206        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -0.54        |
|    agent/train/explained_variance    | 0.937        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0104       |
|    agent/train/n_updates             | 600          |
|    agent/train/policy_gradient_loss  | -0.00466     |
|    agent/train/value_loss            | 0.0326       |
---------------

{'reward_loss': 9.597181844711304, 'reward_accuracy': 0.0}

In [47]:
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

learned_reward_venv = RewardVecEnvWrapper(venv, reward_net.predict_processed)

learner = PPO(
    seed=0,
    policy=FeedForward32Policy,
    policy_kwargs=dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    ),
    env=learned_reward_venv,
    batch_size=64,
    ent_coef=0.01,
    n_epochs=10,
    n_steps=2048 // learned_reward_venv.num_envs,
    clip_range=0.1,
    gae_lambda=0.95,
    gamma=0.97,
    learning_rate=2e-3,
)
learner.learn(100_000)  # Note: set to 100_000 to train a proficient expert

from stable_baselines3.common.evaluation import evaluate_policy

n_eval_episodes = 10
reward_mean, reward_std = evaluate_policy(learner.policy, venv, n_eval_episodes)
reward_stderr = reward_std / np.sqrt(n_eval_episodes)
print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

Reward: 500 +/- 0
