In [1]:
#!pip install compiler_gym 'ray[default,rllib]' &>/dev/null || echo "Install failed!"

import compiler_gym
import ray

from ray.rllib.agents.ppo import PPOTrainer
from compiler_gym.wrappers import ConstrainedCommandline, TimeLimit
from ray import tune
from itertools import islice
from compiler_gym.wrappers import CycleOverBenchmarks
from compiler_gym.util.registration import register

import loop_tool_service

from service_py.datasets import loop_tool_dataset
from service_py.rewards import flops_loop_nest_reward, flops_reward, runtime_reward
import wandb
# Start a Weights & Biases run for this notebook under the "loop_tool" project.
# sync_tensorboard=True is passed so that TensorBoard logs are mirrored to
# wandb. NOTE(review): the entity is hard-coded to a personal account —
# confirm this is intended before sharing the notebook.
wandb.init(project="loop_tool", entity="dejang", sync_tensorboard=True)

  from .autonotebook import tqdm as notebook_tqdm
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdejang[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
def make_env() -> compiler_gym.envs.CompilerEnv:
    """Build the base loop_tool environment used by every cell below.

    The environment is created with the "5_prev_actions_tensor" observation
    space and the "flops_loop_nest_tensor" reward space, then wrapped so that
    an episode is limited to 10 steps.

    Returns:
        The time-limited environment.
    """
    space_selection = {
        "observation_space": "5_prev_actions_tensor",
        "reward_space": "flops_loop_nest_tensor",
    }
    base_env = loop_tool_service.make("loop_tool_env-v0", **space_selection)

    # Cap the episode length at 10 steps.
    return TimeLimit(base_env, max_episode_steps=10)

In [3]:
# Sanity check: open a throwaway environment and report its three spaces.
with make_env() as env:
    for label, attribute in (
        ("Action space:", "action_space"),
        ("Observation space:", "observation_space"),
        ("Reward space:", "reward_space"),
    ):
        # Attributes are looked up one at a time, in the order they are printed.
        print(label, getattr(env, attribute))

Action space: NamedDiscrete([up, down, swap_up, swap_down])
Observation space: Box([[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], (1, 20), float32)
Reward space: flops_loop_nest_tensor


In [4]:
# Choose the benchmark URIs used for training and for testing.
with make_env() as env:
    # Looking the dataset up by name fails early if it is not registered.
    # An islice over lt_dataset.benchmarks() was used previously to pick
    # benchmarks; it is currently disabled in favour of the explicit list.
    lt_dataset = env.datasets["loop_tool_simple-v0"]

    # Other URIs from the same dataset that are currently disabled:
    #   "benchmark://loop_tool_simple-v0/mm128"
    #   "benchmark://loop_tool_simple-v0/mm"
    bench = ["benchmark://loop_tool_simple-v0/simple"]

    # Training and testing intentionally share the very same list object.
    train_benchmarks = test_benchmarks = bench

print("Number of benchmarks for training:", len(train_benchmarks))
print("Number of benchmarks for testing:", len(test_benchmarks))


Number of benchmarks for training: 1
Number of benchmarks for testing: 1


In [5]:
def make_training_env(*args) -> compiler_gym.envs.CompilerEnv:
    """Environment factory registered with ray.

    Builds a fresh environment via make_env() and wraps it so that it cycles
    over the benchmarks listed in train_benchmarks.

    Args:
        *args: Ignored. Accepted only because ray passes an env_config
            argument to the factory.

    Returns:
        The wrapped training environment.
    """
    # The env_config handed over by ray is intentionally not consulted.
    base_env = make_env()
    return CycleOverBenchmarks(base_env, train_benchmarks)


In [6]:
# Smoke test: build one training environment and reset it. The value returned
# by reset() is the last expression of the cell, so it is shown as the output.
# NOTE(review): this env is never closed in the visible cells — confirm
# whether it should be released with env.close() once the check is done.
env = make_training_env()
env.reset()

E0705 15:25:02.042448 140184553244224 example_service.py:263] CRITICAL - 

Working_dir = /dev/shm/compiler_gym_dejang/s/0705T152501-249505-dbfd



array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

In [7]:
# Start from a clean Ray runtime: if an instance is already initialised
# (e.g. this cell was run before), shut it down first.
if ray.is_initialized():
    ray.shutdown()
# Bring Ray up without the dashboard. ignore_reinit_error=True is passed as an
# extra guard in case Ray is still reported as running.
ray.init(include_dashboard=False, ignore_reinit_error=True)

# Register the environment factory under the name "compiler_gym"; this is the
# name used for the "env" entry of PPO_CONFIG.
tune.register_env("compiler_gym", make_training_env)

In [8]:
import time
from ray import tune
from ray.tune import Stopper

class TimeStopper(Stopper):
    """Tune stopper that ends the whole experiment after a time budget.

    The clock starts when the stopper is constructed. Individual trials are
    never stopped by this class; only the experiment-wide stop_all() check
    can return True.

    Args:
        deadline_s: Time budget in seconds, counted from construction.
            Defaults to 30, the value that was previously hard-coded.
    """

    def __init__(self, deadline_s: float = 30):
        # A monotonic clock is used so that adjustments to the system clock
        # cannot shorten or extend the budget.
        self._start = time.monotonic()
        self._deadline = deadline_s

    def __call__(self, trial_id, result):
        """Per-trial check: always False, so no single trial is stopped."""
        return False

    def stop_all(self):
        """Return True once more than the budgeted seconds have elapsed."""
        return time.monotonic() - self._start > self._deadline


In [9]:
# Configuration dictionary passed to tune.run() for the PPO trainer. The
# values are deliberately small (tiny batches, a 5x5 fully-connected network)
# to suit a short training run.
PPO_CONFIG = {
    "log_level": "ERROR",
    # Fixed seed (0xCC == 204).
    "seed": 0xCC,
    "num_workers": 2,
    # Specify the environment to use, where "compiler_gym" is the name we
    # passed to tune.register_env().
    "env": "compiler_gym",
    # Reduce the size of the batch/trajectory lengths to match our short
    # training run.
    "rollout_fragment_length": 5,
    "train_batch_size": 5,
    "sgd_minibatch_size": 5,
    # The trailing "def" notes record the library defaults; the grid_search
    # calls are disabled alternatives for hyperparameter sweeps.
    "gamma": 0.8, #tune.grid_search([0.5, 0.8, 0.9]), # def 0.99
    "lr": 1e-4, #tune.grid_search([0.01, 0.001, 0.0001]), # def 1e-4
    # Episodes are cut after 3 steps, which is shorter than the 10-step
    # TimeLimit applied in make_env().
    "horizon": 3, # def None
    # NOTE(review): with soft_horizon the env is presumably not reset when the
    # horizon is reached — confirm against the RLlib documentation.
    "soft_horizon": True,
    # Run one evaluation episode every 5 training iterations.
    "evaluation_interval": 5, # def None
    "evaluation_num_episodes": 1, # def 10
    # Two hidden layers with 5 units each.
    "model": {'fcnet_hiddens': [5, 5]}
    # Disabled alternative: grid-search the hidden layer sizes.
    # "model": {                            # The NN model we'll optimize.
    #     'fcnet_hiddens': [                # "Fully-connected network with N hidden layers".
    #         tune.grid_search([20, 40]),   # Try these two values for layer one.
    #         tune.grid_search([20, 40])    # Try these two values for layer two (four combinations in total).
    #     ]
    # },
}

In [10]:
# Launch PPO training through Ray Tune with the settings in PPO_CONFIG and
# keep the returned analysis object for later inspection.
analysis = tune.run(
    PPOTrainer,
    # Abort the experiment on the first trial failure.
    fail_fast=True,
    # Save a checkpoint when the trial finishes.
    checkpoint_at_end=True,
    # Disabled alternative: stop on a time budget instead.
    # stop=TimeStopper(),
    # Stop once 1000 episodes have been collected in total.
    stop={
        "episodes_total": 1000,
        # "episode_reward_mean": 30
    },
    config=PPO_CONFIG
)

[2m[36m(PPOTrainer pid=560340)[0m 2022-07-05 15:25:08,982	INFO trainer.py:2332 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPOTrainer pid=560340)[0m 2022-07-05 15:25:09,199	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=560340)[0m 2022-07-05 15:25:09,199	INFO trainer.py:903 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(RolloutWorker pid=560394)[0m E0705 15:25:14.370913 140252437153344 example_service.py:263] CRITICAL - 
[2m[36m(RolloutWorker pid=560394)[0m 
[2m[36m(RolloutWorker pid=560394)[0m Working_dir = /dev/shm/compiler_gym_

Trial name,status,loc
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340


[2m[36m(PPOTrainer pid=560340)[0m 2022-07-05 15:25:19,283	INFO trainable.py:159 -- Trainable.setup took 10.302 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=560394)[0m E0705 15:25:19.297726 140252437153344 example_service.py:263] CRITICAL - 
[2m[36m(RolloutWorker pid=560394)[0m 
[2m[36m(RolloutWorker pid=560394)[0m Working_dir = /dev/shm/compiler_gym_dejang/s/0705T152513-338012-0236
[2m[36m(RolloutWorker pid=560394)[0m 
[2m[36m(RolloutWorker pid=560393)[0m E0705 15:25:19.297726 140432008803904 example_service.py:263] CRITICAL - 
[2m[36m(RolloutWorker pid=560393)[0m 
[2m[36m(RolloutWorker pid=560393)[0m Working_dir = /dev/shm/compiler_gym_dejang/s/0705T152513-337364-6f35
[2m[36m(RolloutWorker pid=560393)[0m 


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 8
  counters:
    num_agent_steps_sampled: 8
    num_agent_steps_trained: 8
    num_env_steps_sampled: 8
    num_env_steps_trained: 8
  custom_metrics: {}
  date: 2022-07-05_15-25-23
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 0.016922388863661375
  episode_reward_mean: 0.01018992093293436
  episode_reward_min: 0.0034574530022073446
  episodes_this_iter: 2
  episodes_total: 2
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 9.999999747378752e-05
          entropy: 1.3862930536270142
          entropy_coeff: 0.0
          kl: 1.5437602769452496e-06
          model: {}
          policy_loss: -0.49426084756851196
          total_loss: -0.49416399002075195
          vf_explained_var: 7.748603536583687e-08
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,1,4.42793,8,0.0101899,0.0169224,0.00345745,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 24
  counters:
    num_agent_steps_sampled: 24
    num_agent_steps_trained: 24
    num_env_steps_sampled: 24
    num_env_steps_trained: 24
  custom_metrics: {}
  date: 2022-07-05_15-25-30
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 29.928253814293377
  episode_reward_mean: 6.420532641255615
  episode_reward_min: -5.5448363869475905
  episodes_this_iter: 4
  episodes_total: 8
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.05000000074505806
          cur_lr: 9.999999747378752e-05
          entropy: 1.3862723112106323
          entropy_coeff: 0.0
          kl: 8.346512913703918e-06
          model: {}
          policy_loss: -0.24581602215766907
          total_loss: 2.5894381999969482
          vf_explained_var: 0.00013541778025683016
         

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,3,10.7168,24,6.42053,29.9283,-5.54484,3


[2m[36m(RolloutWorker pid=560393)[0m Fatal Python error: Segmentation fault
[2m[36m(RolloutWorker pid=560393)[0m 
[2m[36m(RolloutWorker pid=560393)[0m Current thread 0x00007fb8dffff640 (most recent call first):
[2m[36m(RolloutWorker pid=560393)[0m   File "/home/dejang/loop_tool_env/loop_tool_service/service_py/env/loop_tool_env.py", line 74 in get_available_actions
[2m[36m(RolloutWorker pid=560393)[0m   File "./example_service.py", line 316 in apply_action
[2m[36m(RolloutWorker pid=560393)[0m   File "/home/dejang/anaconda3/envs/compiler_gym/lib/python3.8/site-packages/compiler_gym/service/runtime/compiler_gym_service.py", line 201 in Step
[2m[36m(RolloutWorker pid=560393)[0m   File "/home/dejang/anaconda3/envs/compiler_gym/lib/python3.8/site-packages/grpc/_server.py", line 443 in _call_behavior
[2m[36m(RolloutWorker pid=560393)[0m   File "/home/dejang/anaconda3/envs/compiler_gym/lib/python3.8/site-packages/grpc/_server.py", line 560 in _unary_response_in_pool
[

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,3,10.7168,24,6.42053,29.9283,-5.54484,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 32
  counters:
    num_agent_steps_sampled: 32
    num_agent_steps_trained: 32
    num_env_steps_sampled: 32
    num_env_steps_trained: 32
  custom_metrics: {}
  date: 2022-07-05_15-25-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 29.928253814293377
  episode_reward_mean: 2.2581012389619537
  episode_reward_min: -29.963682583498628
  episodes_this_iter: 2
  episodes_total: 10
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.02500000037252903
          cur_lr: 9.999999747378752e-05
          entropy: 1.3862091302871704
          entropy_coeff: 0.0
          kl: 2.0727613900817232e-06
          model: {}
          policy_loss: 0.2542511224746704
          total_loss: 5.562228679656982
          vf_explained_var: -0.00030838249949738383
        

[2m[36m(PPOTrainer pid=560340)[0m E0705 15:25:38.393560 140695939573312 example_service.py:263] CRITICAL - 
[2m[36m(PPOTrainer pid=560340)[0m 
[2m[36m(PPOTrainer pid=560340)[0m Working_dir = /dev/shm/compiler_gym_dejang/s/0705T152516-717902-659b
[2m[36m(PPOTrainer pid=560340)[0m 


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,4,16.3102,32,2.2581,29.9283,-29.9637,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 40
  counters:
    num_agent_steps_sampled: 40
    num_agent_steps_trained: 40
    num_env_steps_sampled: 40
    num_env_steps_trained: 40
  custom_metrics: {}
  date: 2022-07-05_15-25-41
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 29.928253814293377
  episode_reward_mean: 4.32898772247297
  episode_reward_min: -29.963682583498628
  episodes_this_iter: 2
  episodes_total: 12
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.4590602784843412
    episode_reward_mean: -1.4590602784843412
    episode_reward_min: -1.4590602784843412
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.4590602784843412
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.187277793

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,7,28.5011,56,1.47092,29.9283,-29.9637,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 72
  counters:
    num_agent_steps_sampled: 72
    num_agent_steps_trained: 72
    num_env_steps_sampled: 72
    num_env_steps_trained: 72
  custom_metrics: {}
  date: 2022-07-05_15-25-56
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 29.928253814293377
  episode_reward_mean: -0.06113682044809162
  episode_reward_min: -29.963682583498628
  episodes_this_iter: 4
  episodes_total: 24
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0007812500116415322
          cur_lr: 9.999999747378752e-05
          entropy: 1.3862438201904297
          entropy_coeff: 0.0
          kl: 1.6466299257444916e-06
          model: {}
          policy_loss: -0.568730354309082
          total_loss: -0.5659816861152649
          vf_explained_var: 0.23570893704891205
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,9,36.4185,72,-0.0611368,29.9283,-29.9637,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,9,36.4185,72,-0.0611368,29.9283,-29.9637,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 80
  counters:
    num_agent_steps_sampled: 80
    num_agent_steps_trained: 80
    num_env_steps_sampled: 80
    num_env_steps_trained: 80
  custom_metrics: {}
  date: 2022-07-05_15-26-02
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 29.928253814293377
  episode_reward_mean: -0.0061669844945650976
  episode_reward_min: -29.963682583498628
  episodes_this_iter: 2
  episodes_total: 26
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.015589807871945971
    episode_reward_mean: -0.015589807871945971
    episode_reward_min: -0.015589807871945971
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.015589807871945971
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,12,49.8196,96,1.06737,34.2536,-29.9637,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 112
  counters:
    num_agent_steps_sampled: 112
    num_agent_steps_trained: 112
    num_env_steps_sampled: 112
    num_env_steps_trained: 112
  custom_metrics: {}
  date: 2022-07-05_15-26-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.25356114399076
  episode_reward_mean: 0.9050328473338679
  episode_reward_min: -34.22635813471207
  episodes_this_iter: 2
  episodes_total: 36
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.441406286379788e-05
          cur_lr: 9.999999747378752e-05
          entropy: 1.3861278295516968
          entropy_coeff: 0.0
          kl: 3.838016255031107e-06
          model: {}
          policy_loss: 0.7516098618507385
          total_loss: 2.7790138721466064
          vf_explained_var: -0.0006596406456083059
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,14,56.4657,112,0.905033,34.2536,-34.2264,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,14,56.4657,112,0.905033,34.2536,-34.2264,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 120
  counters:
    num_agent_steps_sampled: 120
    num_agent_steps_trained: 120
    num_env_steps_sampled: 120
    num_env_steps_trained: 120
  custom_metrics: {}
  date: 2022-07-05_15-26-22
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.25356114399076
  episode_reward_mean: 0.5700955664036595
  episode_reward_min: -34.44558506390648
  episodes_this_iter: 4
  episodes_total: 40
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.3917012234340183
    episode_reward_mean: 1.3917012234340183
    episode_reward_min: 1.3917012234340183
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.3917012234340183
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.15835762

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,17,69.7503,136,-0.0330003,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 152
  counters:
    num_agent_steps_sampled: 152
    num_agent_steps_trained: 152
    num_env_steps_sampled: 152
    num_env_steps_trained: 152
  custom_metrics: {}
  date: 2022-07-05_15-26-37
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.6840631515171718
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 50
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.629394644936838e-07
          cur_lr: 9.999999747378752e-05
          entropy: 1.386272668838501
          entropy_coeff: 0.0
          kl: 4.187044851278188e-06
          model: {}
          policy_loss: -0.46705392003059387
          total_loss: 3.9676740169525146
          vf_explained_var: 0.0007580498931929469
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,19,77.3469,152,0.684063,34.4903,-34.49,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,19,77.3469,152,0.684063,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 160
  counters:
    num_agent_steps_sampled: 160
    num_agent_steps_trained: 160
    num_env_steps_sampled: 160
    num_env_steps_trained: 160
  custom_metrics: {}
  date: 2022-07-05_15-26-43
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: -0.03010272064518691
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 52
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.011528375400177815
    episode_reward_mean: -0.011528375400177815
    episode_reward_min: -0.011528375400177815
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.011528375400177815
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,22,90.2665,176,0.455725,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 192
  counters:
    num_agent_steps_sampled: 192
    num_agent_steps_trained: 192
    num_env_steps_sampled: 192
    num_env_steps_trained: 192
  custom_metrics: {}
  date: 2022-07-05_15-26-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.3420870546187124
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 4
  episodes_total: 64
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.3841858265427618e-08
          cur_lr: 9.999999747378752e-05
          entropy: 1.3862097263336182
          entropy_coeff: 0.0
          kl: 8.180185204764712e-07
          model: {}
          policy_loss: 0.2958478033542633
          total_loss: 0.2968146502971649
          vf_explained_var: 0.21754883229732513
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,24,97.0944,192,0.342087,34.4903,-34.49,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,24,97.0944,192,0.342087,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 200
  counters:
    num_agent_steps_sampled: 200
    num_agent_steps_trained: 200
    num_env_steps_sampled: 200
    num_env_steps_trained: 200
  custom_metrics: {}
  date: 2022-07-05_15-27-03
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.34204484870071267
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 66
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.3812582286654451
    episode_reward_mean: -1.3812582286654451
    episode_reward_min: -1.3812582286654451
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.3812582286654451
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,27,109.193,216,0.397743,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 232
  counters:
    num_agent_steps_sampled: 232
    num_agent_steps_trained: 232
    num_env_steps_sampled: 232
    num_env_steps_trained: 232
  custom_metrics: {}
  date: 2022-07-05_15-27-15
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: -0.018300560154286306
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 76
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.450580707946131e-10
          cur_lr: 9.999999747378752e-05
          entropy: 1.3860942125320435
          entropy_coeff: 0.0
          kl: 7.276671567524318e-06
          model: {}
          policy_loss: 0.3426475524902344
          total_loss: 4.345736980438232
          vf_explained_var: -5.410115045378916e-05


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,29,115.368,232,-0.0183006,34.4903,-34.49,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,29,115.368,232,-0.0183006,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 240
  counters:
    num_agent_steps_sampled: 240
    num_agent_steps_trained: 240
    num_env_steps_sampled: 240
    num_env_steps_trained: 240
  custom_metrics: {}
  date: 2022-07-05_15-27-21
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.39073733317345666
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 4
  episodes_total: 80
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.3880605763294587
    episode_reward_mean: 1.3880605763294587
    episode_reward_min: 1.3880605763294587
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.3880605763294587
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.14877

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,32,126.977,256,0.764423,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 272
  counters:
    num_agent_steps_sampled: 272
    num_agent_steps_trained: 272
    num_env_steps_sampled: 272
    num_env_steps_trained: 272
  custom_metrics: {}
  date: 2022-07-05_15-27-34
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.33746636267994073
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 90
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.3283064712331658e-11
          cur_lr: 9.999999747378752e-05
          entropy: 1.3859444856643677
          entropy_coeff: 0.0
          kl: 2.2294666450761724e-06
          model: {}
          policy_loss: -0.4746439754962921
          total_loss: 3.5254623889923096
          vf_explained_var: -0.011364527978003025

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,34,133.378,272,0.337466,34.4903,-34.49,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,34,133.378,272,0.337466,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 280
  counters:
    num_agent_steps_sampled: 280
    num_agent_steps_trained: 280
    num_env_steps_sampled: 280
    num_env_steps_trained: 280
  custom_metrics: {}
  date: 2022-07-05_15-27-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.490288723481925
  episode_reward_mean: 0.2805456278363427
  episode_reward_min: -34.489986266107266
  episodes_this_iter: 2
  episodes_total: 92
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.388075453575006
    episode_reward_mean: -1.388075453575006
    episode_reward_min: -1.388075453575006
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.388075453575006
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.151146

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,37,147.135,296,-0.0148694,34.4903,-34.49,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 312
  counters:
    num_agent_steps_sampled: 312
    num_agent_steps_trained: 312
    num_env_steps_sampled: 312
    num_env_steps_trained: 312
  custom_metrics: {}
  date: 2022-07-05_15-27-55
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: -0.30161280756486086
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 4
  episodes_total: 104
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.275957722603643e-13
          cur_lr: 9.999999747378752e-05
          entropy: 1.385935664176941
          entropy_coeff: 0.0
          kl: 2.6449085908097913e-06
          model: {}
          policy_loss: -0.5318413972854614
          total_loss: 1.4687069654464722
          vf_explained_var: 0.00035844644298776984

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,39,154.745,312,-0.301613,35.1169,-35.1188,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,39,154.745,312,-0.301613,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 320
  counters:
    num_agent_steps_sampled: 320
    num_agent_steps_trained: 320
    num_env_steps_sampled: 320
    num_env_steps_trained: 320
  custom_metrics: {}
  date: 2022-07-05_15-28-01
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: -0.26363971903339284
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 2
  episodes_total: 106
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.00011138290769419079
    episode_reward_mean: 0.00011138290769419079
    episode_reward_min: 0.00011138290769419079
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.00011138290769419079
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proce

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,42,168.101,336,-0.534409,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 352
  counters:
    num_agent_steps_sampled: 352
    num_agent_steps_trained: 352
    num_env_steps_sampled: 352
    num_env_steps_trained: 352
  custom_metrics: {}
  date: 2022-07-05_15-28-17
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: -0.21125787459346784
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 2
  episodes_total: 116
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.2737367883136385e-14
          cur_lr: 9.999999747378752e-05
          entropy: 1.3861244916915894
          entropy_coeff: 0.0
          kl: 2.5635310976213077e-06
          model: {}
          policy_loss: -0.10793797671794891
          total_loss: -0.10580019652843475
          vf_explained_var: -0.09408716112375

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,44,176.347,352,-0.211258,35.1169,-35.1188,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,44,176.347,352,-0.211258,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 360
  counters:
    num_agent_steps_sampled: 360
    num_agent_steps_trained: 360
    num_env_steps_sampled: 360
    num_env_steps_trained: 360
  custom_metrics: {}
  date: 2022-07-05_15-28-24
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: 0.014454484847235922
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 4
  episodes_total: 120
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.006045475704037906
    episode_reward_mean: 0.006045475704037906
    episode_reward_min: 0.006045475704037906
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.006045475704037906
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,46,186.375,368,0.246841,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 376
  counters:
    num_agent_steps_sampled: 376
    num_agent_steps_trained: 376
    num_env_steps_sampled: 376
    num_env_steps_trained: 376
  custom_metrics: {}
  date: 2022-07-05_15-28-31
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: 0.279763930005682
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 2
  episodes_total: 124
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.842170985392048e-15
          cur_lr: 9.999999747378752e-05
          entropy: 1.3860105276107788
          entropy_coeff: 0.0
          kl: 4.47606453235494e-06
          model: {}
          policy_loss: 0.43855544924736023
          total_loss: 4.439111232757568
          vf_explained_var: -0.0007113814353942871
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,48,193.52,384,0.334771,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 392
  counters:
    num_agent_steps_sampled: 392
    num_agent_steps_trained: 392
    num_env_steps_sampled: 392
    num_env_steps_trained: 392
  custom_metrics: {}
  date: 2022-07-05_15-28-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: -0.07416442378132787
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 2
  episodes_total: 130
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.10542746348012e-16
          cur_lr: 9.999999747378752e-05
          entropy: 1.3858932256698608
          entropy_coeff: 0.0
          kl: 9.741040685184998e-07
          model: {}
          policy_loss: 0.2999935746192932
          total_loss: 4.300436019897461
          vf_explained_var: -0.0016021688934415579
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,49,197.337,392,-0.0741644,35.1169,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 400
  counters:
    num_agent_steps_sampled: 400
    num_agent_steps_trained: 400
    num_env_steps_sampled: 400
    num_env_steps_trained: 400
  custom_metrics: {}
  date: 2022-07-05_15-28-44
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.11691174532054
  episode_reward_mean: -0.008053343287597325
  episode_reward_min: -35.118821415117345
  episodes_this_iter: 2
  episodes_total: 132
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.08074798660062621
    episode_reward_mean: 0.08074798660062621
    episode_reward_min: 0.08074798660062621
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.08074798660062621
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,52,207.874,416,0.794367,39.4073,-35.1188,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 432
  counters:
    num_agent_steps_sampled: 432
    num_agent_steps_trained: 432
    num_env_steps_sampled: 432
    num_env_steps_trained: 432
  custom_metrics: {}
  date: 2022-07-05_15-28-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.2904721791812971
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 144
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.2204460823375376e-17
          cur_lr: 9.999999747378752e-05
          entropy: 1.3854669332504272
          entropy_coeff: 0.0
          kl: 4.511838596954476e-06
          model: {}
          policy_loss: -0.18849046528339386
          total_loss: 3.8125030994415283
          vf_explained_var: -0.0013168136356398463
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,54,215.804,432,0.290472,39.4073,-39.5652,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,54,215.804,432,0.290472,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 440
  counters:
    num_agent_steps_sampled: 440
    num_agent_steps_trained: 440
    num_env_steps_sampled: 440
    num_env_steps_trained: 440
  custom_metrics: {}
  date: 2022-07-05_15-29-03
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.014896529851941658
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 146
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 28.88762923253877
    episode_reward_mean: 28.88762923253877
    episode_reward_min: 28.88762923253877
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 28.88762923253877
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1465012045

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,57,229.971,456,0.347141,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 472
  counters:
    num_agent_steps_sampled: 472
    num_agent_steps_trained: 472
    num_env_steps_sampled: 472
    num_env_steps_trained: 472
  custom_metrics: {}
  date: 2022-07-05_15-29-20
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.002389849332891254
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 156
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.938894007304805e-19
          cur_lr: 9.999999747378752e-05
          entropy: 1.3851624727249146
          entropy_coeff: 0.0
          kl: 7.332072073040763e-06
          model: {}
          policy_loss: 0.5276077389717102
          total_loss: 0.587431013584137
          vf_explained_var: -0.02383425645530224
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,59,238.119,472,0.00238985,39.4073,-39.5652,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,59,238.119,472,0.00238985,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 480
  counters:
    num_agent_steps_sampled: 480
    num_agent_steps_trained: 480
    num_env_steps_sampled: 480
    num_env_steps_trained: 480
  custom_metrics: {}
  date: 2022-07-05_15-29-26
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.06571429820922296
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 160
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -5.706423848562427
    episode_reward_mean: -5.706423848562427
    episode_reward_min: -5.706423848562427
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -5.706423848562427
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1464727

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,61,248.177,488,-0.0472042,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 496
  counters:
    num_agent_steps_sampled: 496
    num_agent_steps_trained: 496
    num_env_steps_sampled: 496
    num_env_steps_trained: 496
  custom_metrics: {}
  date: 2022-07-05_15-29-33
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.0505697688704388
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 164
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 8.673617509131006e-20
          cur_lr: 9.999999747378752e-05
          entropy: 1.3856850862503052
          entropy_coeff: 0.0
          kl: 6.346278382807213e-07
          model: {}
          policy_loss: -0.5738440752029419
          total_loss: -0.5069561004638672
          vf_explained_var: 0.019644299522042274
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,63,254.926,504,-0.263601,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 512
  counters:
    num_agent_steps_sampled: 512
    num_agent_steps_trained: 512
    num_env_steps_sampled: 512
    num_env_steps_trained: 512
  custom_metrics: {}
  date: 2022-07-05_15-29-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.12286497028483315
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 170
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.1684043772827515e-20
          cur_lr: 9.999999747378752e-05
          entropy: 1.3855855464935303
          entropy_coeff: 0.0
          kl: 8.205234735214617e-06
          model: {}
          policy_loss: -0.2962568402290344
          total_loss: 3.703948497772217
          vf_explained_var: -0.001406852388754487
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,64,258.111,512,0.122865,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 520
  counters:
    num_agent_steps_sampled: 520
    num_agent_steps_trained: 520
    num_env_steps_sampled: 520
    num_env_steps_trained: 520
  custom_metrics: {}
  date: 2022-07-05_15-29-46
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.10074491704155036
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 172
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -23.267039715355118
    episode_reward_mean: -23.267039715355118
    episode_reward_min: -23.267039715355118
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -23.267039715355118
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.145

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,67,269.925,536,0.644229,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 552
  counters:
    num_agent_steps_sampled: 552
    num_agent_steps_trained: 552
    num_env_steps_sampled: 552
    num_env_steps_trained: 552
  custom_metrics: {}
  date: 2022-07-05_15-29-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.42325612644007665
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 184
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.776263679008599e-22
          cur_lr: 9.999999747378752e-05
          entropy: 1.3855211734771729
          entropy_coeff: 0.0
          kl: 1.0570239282969851e-05
          model: {}
          policy_loss: -0.523387610912323
          total_loss: 5.097594738006592
          vf_explained_var: -0.0013224601279944181
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,69,275.243,552,-0.423256,39.4073,-39.5652,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,69,275.243,552,-0.423256,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 560
  counters:
    num_agent_steps_sampled: 560
    num_agent_steps_trained: 560
    num_env_steps_sampled: 560
    num_env_steps_trained: 560
  custom_metrics: {}
  date: 2022-07-05_15-30-03
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.22500711983016558
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 186
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.09089218064208293
    episode_reward_mean: 0.09089218064208293
    episode_reward_min: 0.09089218064208293
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.09089218064208293
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.145

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,72,287.294,576,-0.286151,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 592
  counters:
    num_agent_steps_sampled: 592
    num_agent_steps_trained: 592
    num_env_steps_sampled: 592
    num_env_steps_trained: 592
  custom_metrics: {}
  date: 2022-07-05_15-30-17
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.2116147690392779
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 196
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.117582399690187e-23
          cur_lr: 9.999999747378752e-05
          entropy: 1.3843411207199097
          entropy_coeff: 0.0
          kl: 6.884315553179476e-06
          model: {}
          policy_loss: 0.09582927823066711
          total_loss: 0.09906155616044998
          vf_explained_var: -1.0
          vf_loss: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,74,294.689,592,-0.211615,39.4073,-39.5652,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,74,294.689,592,-0.211615,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 600
  counters:
    num_agent_steps_sampled: 600
    num_agent_steps_trained: 600
    num_env_steps_sampled: 600
    num_env_steps_trained: 600
  custom_metrics: {}
  date: 2022-07-05_15-30-24
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.3515701978210245
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 200
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.0005383608557222308
    episode_reward_mean: 0.0005383608557222308
    episode_reward_min: 0.0005383608557222308
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.0005383608557222308
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,76,304.966,608,-0.350926,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 616
  counters:
    num_agent_steps_sampled: 616
    num_agent_steps_trained: 616
    num_env_steps_sampled: 616
    num_env_steps_trained: 616
  custom_metrics: {}
  date: 2022-07-05_15-30-31
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.000347765342747145
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 204
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.646977999612734e-24
          cur_lr: 9.999999747378752e-05
          entropy: 1.384734869003296
          entropy_coeff: 0.0
          kl: 1.0346581802878063e-05
          model: {}
          policy_loss: 0.22108988463878632
          total_loss: 2.2246930599212646
          vf_explained_var: 0.0033480802085250616
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,78,312.317,624,-0.000481844,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 632
  counters:
    num_agent_steps_sampled: 632
    num_agent_steps_trained: 632
    num_env_steps_sampled: 632
    num_env_steps_trained: 632
  custom_metrics: {}
  date: 2022-07-05_15-30-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.0004548826650939308
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 210
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.617444999031835e-25
          cur_lr: 9.999999747378752e-05
          entropy: 1.385851502418518
          entropy_coeff: 0.0
          kl: 2.388401981079369e-06
          model: {}
          policy_loss: 0.18791532516479492
          total_loss: 0.5785229206085205
          vf_explained_var: -0.01729259081184864
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,79,316.139,632,-0.000454883,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 640
  counters:
    num_agent_steps_sampled: 640
    num_agent_steps_trained: 640
    num_env_steps_sampled: 640
    num_env_steps_trained: 640
  custom_metrics: {}
  date: 2022-07-05_15-30-45
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.01325255530051971
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 212
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.000972798086700366
    episode_reward_mean: 0.000972798086700366
    episode_reward_min: 0.000972798086700366
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.000972798086700366
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,81,327.405,648,0.280072,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 656
  counters:
    num_agent_steps_sampled: 656
    num_agent_steps_trained: 656
    num_env_steps_sampled: 656
    num_env_steps_trained: 656
  custom_metrics: {}
  date: 2022-07-05_15-30-53
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.3409824761954974
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 218
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 8.271806248789793e-26
          cur_lr: 9.999999747378752e-05
          entropy: 1.3851786851882935
          entropy_coeff: 0.0
          kl: 1.761790736054536e-05
          model: {}
          policy_loss: -0.4380095303058624
          total_loss: 4.149560928344727
          vf_explained_var: 0.013327368535101414
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,83,334.477,664,-0.000143498,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 672
  counters:
    num_agent_steps_sampled: 672
    num_agent_steps_trained: 672
    num_env_steps_sampled: 672
    num_env_steps_trained: 672
  custom_metrics: {}
  date: 2022-07-05_15-31-01
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.27944580508311256
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 224
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.0679515621974483e-26
          cur_lr: 9.999999747378752e-05
          entropy: 1.384314775466919
          entropy_coeff: 0.0
          kl: 1.213568975799717e-05
          model: {}
          policy_loss: 0.2244318425655365
          total_loss: 2.239877939224243
          vf_explained_var: 0.0008020102977752686
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,84,338.159,672,-0.279446,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 680
  counters:
    num_agent_steps_sampled: 680
    num_agent_steps_trained: 680
    num_env_steps_sampled: 680
    num_env_steps_trained: 680
  custom_metrics: {}
  date: 2022-07-05_15-31-08
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: -0.29337707715728806
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 226
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.00040541113245573523
    episode_reward_mean: 0.00040541113245573523
    episode_reward_min: 0.00040541113245573523
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.00040541113245573523
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_process

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,86,348.816,688,-0.348602,39.4073,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 696
  counters:
    num_agent_steps_sampled: 696
    num_agent_steps_trained: 696
    num_env_steps_sampled: 696
    num_env_steps_trained: 696
  custom_metrics: {}
  date: 2022-07-05_15-31-15
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.4073055421964
  episode_reward_mean: 0.03244444666341829
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 4
  episodes_total: 232
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.5849394527468104e-27
          cur_lr: 9.999999747378752e-05
          entropy: 1.3857172727584839
          entropy_coeff: 0.0
          kl: 1.7314083606834174e-06
          model: {}
          policy_loss: 0.16481517255306244
          total_loss: 2.167149305343628
          vf_explained_var: -0.0013069987762719393
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,88,355.3,704,-0.31354,38.6885,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 712
  counters:
    num_agent_steps_sampled: 712
    num_agent_steps_trained: 712
    num_env_steps_sampled: 712
    num_env_steps_trained: 712
  custom_metrics: {}
  date: 2022-07-05_15-31-21
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.688538726889576
  episode_reward_mean: -0.2516088373843116
  episode_reward_min: -39.56517672311935
  episodes_this_iter: 2
  episodes_total: 236
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.462348631867026e-28
          cur_lr: 9.999999747378752e-05
          entropy: 1.3847732543945312
          entropy_coeff: 0.0
          kl: 5.2936711654183455e-06
          model: {}
          policy_loss: -0.5026589035987854
          total_loss: 3.843308448791504
          vf_explained_var: 0.0008262813207693398
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,89,358.366,712,-0.251609,38.6885,-39.5652,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 720
  counters:
    num_agent_steps_sampled: 720
    num_agent_steps_trained: 720
    num_env_steps_sampled: 720
    num_env_steps_trained: 720
  custom_metrics: {}
  date: 2022-07-05_15-31-28
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.688538726889576
  episode_reward_mean: -0.3860259636646004
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 4
  episodes_total: 240
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.0021375574480978488
    episode_reward_mean: -0.0021375574480978488
    episode_reward_min: -0.0021375574480978488
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.0021375574480978488
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proces

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,91,369.041,728,-0.400408,38.6885,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 736
  counters:
    num_agent_steps_sampled: 736
    num_agent_steps_trained: 736
    num_env_steps_sampled: 736
    num_env_steps_trained: 736
  custom_metrics: {}
  date: 2022-07-05_15-31-36
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.688538726889576
  episode_reward_mean: -0.29077606908860465
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 244
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 8.077935789833782e-29
          cur_lr: 9.999999747378752e-05
          entropy: 1.38528573513031
          entropy_coeff: 0.0
          kl: 2.0939055502822157e-06
          model: {}
          policy_loss: 0.1629982888698578
          total_loss: 2.1659553050994873
          vf_explained_var: 0.0031259774696081877
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,93,377.475,744,0.375651,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 752
  counters:
    num_agent_steps_sampled: 752
    num_agent_steps_trained: 752
    num_env_steps_sampled: 752
    num_env_steps_trained: 752
  custom_metrics: {}
  date: 2022-07-05_15-31-44
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.3809297712576688
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 250
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.0194839474584456e-29
          cur_lr: 9.999999747378752e-05
          entropy: 1.3856669664382935
          entropy_coeff: 0.0
          kl: 8.782254553807434e-06
          model: {}
          policy_loss: -0.2550975978374481
          total_loss: 6.359776020050049
          vf_explained_var: 0.0017148733604699373
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,94,380.676,752,0.38093,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 760
  counters:
    num_agent_steps_sampled: 760
    num_agent_steps_trained: 760
    num_env_steps_sampled: 760
    num_env_steps_trained: 760
  custom_metrics: {}
  date: 2022-07-05_15-31-51
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.34586721235130624
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 252
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.003743676432123255
    episode_reward_mean: 0.003743676432123255
    episode_reward_min: 0.003743676432123255
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.003743676432123255
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,96,391.907,768,0.098136,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 776
  counters:
    num_agent_steps_sampled: 776
    num_agent_steps_trained: 776
    num_env_steps_sampled: 776
    num_env_steps_trained: 776
  custom_metrics: {}
  date: 2022-07-05_15-31-59
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.0114861959176692
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 258
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.524354934323057e-30
          cur_lr: 9.999999747378752e-05
          entropy: 1.385690689086914
          entropy_coeff: 0.0
          kl: 1.2765560768457362e-06
          model: {}
          policy_loss: -0.16235855221748352
          total_loss: -0.1217077448964119
          vf_explained_var: 1.7881394143159923e-08


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,98,399.181,784,-0.0483815,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 792
  counters:
    num_agent_steps_sampled: 792
    num_agent_steps_trained: 792
    num_env_steps_sampled: 792
    num_env_steps_trained: 792
  custom_metrics: {}
  date: 2022-07-05_15-32-06
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.04627884414191613
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 4
  episodes_total: 264
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.3108873358076425e-31
          cur_lr: 9.999999747378752e-05
          entropy: 1.3853449821472168
          entropy_coeff: 0.0
          kl: 2.185408220611862e-06
          model: {}
          policy_loss: 0.38183578848838806
          total_loss: 4.382559776306152
          vf_explained_var: -0.00421886844560504
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,99,402.863,792,0.0462788,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 800
  counters:
    num_agent_steps_sampled: 800
    num_agent_steps_trained: 800
    num_env_steps_sampled: 800
    num_env_steps_trained: 800
  custom_metrics: {}
  date: 2022-07-05_15-32-13
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.04683016735673588
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 266
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.00032014742736463653
    episode_reward_mean: 0.00032014742736463653
    episode_reward_min: 0.00032014742736463653
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.00032014742736463653
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_process

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,102,414.864,816,0.259379,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 832
  counters:
    num_agent_steps_sampled: 832
    num_agent_steps_trained: 832
    num_env_steps_sampled: 832
    num_env_steps_trained: 832
  custom_metrics: {}
  date: 2022-07-05_15-32-25
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.2696640962908772
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 2
  episodes_total: 276
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.9721522924398883e-32
          cur_lr: 9.999999747378752e-05
          entropy: 1.385023593902588
          entropy_coeff: 0.0
          kl: 3.228035666325013e-06
          model: {}
          policy_loss: -0.6011038422584534
          total_loss: 5.3993754386901855
          vf_explained_var: -0.007451708894222975
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,104,421.248,832,-0.269664,39.1038,-39.5062,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,104,421.248,832,-0.269664,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 840
  counters:
    num_agent_steps_sampled: 840
    num_agent_steps_trained: 840
    num_env_steps_sampled: 840
    num_env_steps_trained: 840
  custom_metrics: {}
  date: 2022-07-05_15-32-30
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.2487974399230081
  episode_reward_min: -39.50620098288872
  episodes_this_iter: 4
  episodes_total: 280
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.007466559791082572
    episode_reward_mean: -0.007466559791082572
    episode_reward_min: -0.007466559791082572
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.007466559791082572
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,107,432.546,856,0.442048,39.1038,-39.5062,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 872
  counters:
    num_agent_steps_sampled: 872
    num_agent_steps_trained: 872
    num_env_steps_sampled: 872
    num_env_steps_trained: 872
  custom_metrics: {}
  date: 2022-07-05_15-32-42
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.05379647445831992
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 290
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.162975913874651e-34
          cur_lr: 9.999999747378752e-05
          entropy: 1.384265661239624
          entropy_coeff: 0.0
          kl: 6.254780146264238e-06
          model: {}
          policy_loss: 0.2936437427997589
          total_loss: 5.310583114624023
          vf_explained_var: 0.008448241278529167
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,109,438.367,872,-0.0537965,39.1038,-39.4655,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,109,438.367,872,-0.0537965,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 880
  counters:
    num_agent_steps_sampled: 880
    num_agent_steps_trained: 880
    num_env_steps_sampled: 880
    num_env_steps_trained: 880
  custom_metrics: {}
  date: 2022-07-05_15-32-49
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.675677348959817
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 292
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.006710444063893872
    episode_reward_mean: 0.006710444063893872
    episode_reward_min: 0.006710444063893872
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.006710444063893872
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,112,449.884,896,0.525853,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 912
  counters:
    num_agent_steps_sampled: 912
    num_agent_steps_trained: 912
    num_env_steps_sampled: 912
    num_env_steps_trained: 912
  custom_metrics: {}
  date: 2022-07-05_15-33-00
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.0003007447146730635
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 4
  episodes_total: 304
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.9259299730858284e-35
          cur_lr: 9.999999747378752e-05
          entropy: 1.3844172954559326
          entropy_coeff: 0.0
          kl: 1.0178458978771232e-05
          model: {}
          policy_loss: 0.05820029601454735
          total_loss: 6.0608439445495605
          vf_explained_var: 0.0016584635013714

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,114,455.646,912,-0.000300745,39.1038,-39.4655,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,114,455.646,912,-0.000300745,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 920
  counters:
    num_agent_steps_sampled: 920
    num_agent_steps_trained: 920
    num_env_steps_sampled: 920
    num_env_steps_trained: 920
  custom_metrics: {}
  date: 2022-07-05_15-33-05
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.4859580234017989
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 306
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.2984587937108258
    episode_reward_mean: 1.2984587937108258
    episode_reward_min: 1.2984587937108258
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.2984587937108258
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.146913

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,117,466.127,936,0.613346,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 944
  counters:
    num_agent_steps_sampled: 944
    num_agent_steps_trained: 944
    num_env_steps_sampled: 944
    num_env_steps_trained: 944
  custom_metrics: {}
  date: 2022-07-05_15-33-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.2930233218861914
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 314
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.2037062331786428e-36
          cur_lr: 9.999999747378752e-05
          entropy: 1.3842031955718994
          entropy_coeff: 0.0
          kl: 1.7990909327636473e-05
          model: {}
          policy_loss: 0.3397553861141205
          total_loss: 4.3583197593688965
          vf_explained_var: 0.0043681105598807335


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,119,473.474,952,-0.292635,39.1038,-39.4655,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,119,473.474,952,-0.292635,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 960
  counters:
    num_agent_steps_sampled: 960
    num_agent_steps_trained: 960
    num_env_steps_sampled: 960
    num_env_steps_trained: 960
  custom_metrics: {}
  date: 2022-07-05_15-33-24
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.013660952852258133
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 4
  episodes_total: 320
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.004445921062639524
    episode_reward_mean: -0.004445921062639524
    episode_reward_min: -0.004445921062639524
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.004445921062639524
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processi

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,121,484.479,968,-0.013947,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 976
  counters:
    num_agent_steps_sampled: 976
    num_agent_steps_trained: 976
    num_env_steps_sampled: 976
    num_env_steps_trained: 976
  custom_metrics: {}
  date: 2022-07-05_15-33-32
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.0003869437157687605
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 324
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.523163957366517e-38
          cur_lr: 9.999999747378752e-05
          entropy: 1.3850499391555786
          entropy_coeff: 0.0
          kl: 4.630405965144746e-05
          model: {}
          policy_loss: -0.24368511140346527
          total_loss: 1.7639400959014893
          vf_explained_var: 0.003024860285222530

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,123,490.887,984,0.269511,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 992
  counters:
    num_agent_steps_sampled: 992
    num_agent_steps_trained: 992
    num_env_steps_sampled: 992
    num_env_steps_trained: 992
  custom_metrics: {}
  date: 2022-07-05_15-33-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.25673613690846886
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 330
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.8807909893416293e-38
          cur_lr: 9.999999747378752e-05
          entropy: 1.3836473226547241
          entropy_coeff: 0.0
          kl: 8.486261322104838e-06
          model: {}
          policy_loss: 0.38144752383232117
          total_loss: 7.44150972366333
          vf_explained_var: 0.005577270407229662
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,125,498.631,1000,0.262272,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1016
  counters:
    num_agent_steps_sampled: 1016
    num_agent_steps_trained: 1016
    num_env_steps_sampled: 1016
    num_env_steps_trained: 1016
  custom_metrics: {}
  date: 2022-07-05_15-33-50
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: -0.11020564649302596
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 338
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3842623233795166
          entropy_coeff: 0.0
          kl: 6.1413165894919075e-06
          model: {}
          policy_loss: 0.5494164228439331
          total_loss: 4.550925254821777
          vf_explained_var: -0.010136513039469719
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,127,505.394,1016,-0.110206,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1032
  counters:
    num_agent_steps_sampled: 1032
    num_agent_steps_trained: 1032
    num_env_steps_sampled: 1032
    num_env_steps_trained: 1032
  custom_metrics: {}
  date: 2022-07-05_15-33-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.10377045214865
  episode_reward_mean: 0.3077563734926253
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 4
  episodes_total: 344
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3848457336425781
          entropy_coeff: 0.0
          kl: 1.3260533933134866e-06
          model: {}
          policy_loss: -0.17058314383029938
          total_loss: 0.21442262828350067
          vf_explained_var: 0.07248704880475998
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,129,512.009,1032,0.307756,39.1038,-39.4655,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,129,512.009,1032,0.307756,39.1038,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1040
  counters:
    num_agent_steps_sampled: 1040
    num_agent_steps_trained: 1040
    num_env_steps_sampled: 1040
    num_env_steps_trained: 1040
  custom_metrics: {}
  date: 2022-07-05_15-34-04
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: -0.12860003103405807
  episode_reward_min: -39.465538815414156
  episodes_this_iter: 2
  episodes_total: 346
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.06230683048111341
    episode_reward_mean: -0.06230683048111341
    episode_reward_min: -0.06230683048111341
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.06230683048111341
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processi

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,131,522.981,1048,-0.0893415,38.5795,-39.4655,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1056
  counters:
    num_agent_steps_sampled: 1056
    num_agent_steps_trained: 1056
    num_env_steps_sampled: 1056
    num_env_steps_trained: 1056
  custom_metrics: {}
  date: 2022-07-05_15-34-11
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: 0.6427911323660048
  episode_reward_min: -38.43775633427545
  episodes_this_iter: 4
  episodes_total: 352
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3847283124923706
          entropy_coeff: 0.0
          kl: 6.304989256022964e-06
          model: {}
          policy_loss: -0.15893785655498505
          total_loss: 4.336456298828125
          vf_explained_var: 0.002164218807592988
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,133,528.748,1064,0.239307,38.5795,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1072
  counters:
    num_agent_steps_sampled: 1072
    num_agent_steps_trained: 1072
    num_env_steps_sampled: 1072
    num_env_steps_trained: 1072
  custom_metrics: {}
  date: 2022-07-05_15-34-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: 0.2867637835475283
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 356
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3851529359817505
          entropy_coeff: 0.0
          kl: 8.477209121338092e-06
          model: {}
          policy_loss: -0.1315872073173523
          total_loss: 2.0154361724853516
          vf_explained_var: 0.003321524476632476
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,134,530.929,1072,0.286764,38.5795,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1080
  counters:
    num_agent_steps_sampled: 1080
    num_agent_steps_trained: 1080
    num_env_steps_sampled: 1080
    num_env_steps_trained: 1080
  custom_metrics: {}
  date: 2022-07-05_15-34-21
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: 0.07803779303418874
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 360
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.06103859001091583
    episode_reward_mean: 0.06103859001091583
    episode_reward_min: 0.06103859001091583
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.06103859001091583
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,137,542.824,1096,0.00347743,38.5795,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1112
  counters:
    num_agent_steps_sampled: 1112
    num_agent_steps_trained: 1112
    num_env_steps_sampled: 1112
    num_env_steps_trained: 1112
  custom_metrics: {}
  date: 2022-07-05_15-34-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: -0.39661374729061727
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 370
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3853185176849365
          entropy_coeff: 0.0
          kl: 3.221603265046724e-06
          model: {}
          policy_loss: 0.5086604952812195
          total_loss: 4.51775598526001
          vf_explained_var: 0.008103663101792336
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,139,549.597,1112,-0.396614,38.5795,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,139,549.597,1112,-0.396614,38.5795,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1120
  counters:
    num_agent_steps_sampled: 1120
    num_agent_steps_trained: 1120
    num_env_steps_sampled: 1120
    num_env_steps_trained: 1120
  custom_metrics: {}
  date: 2022-07-05_15-34-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.57952533515323
  episode_reward_mean: -0.050346775109528966
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 372
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.03357035873318992
    episode_reward_mean: -0.03357035873318992
    episode_reward_min: -0.03357035873318992
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.03357035873318992
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processi

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,142,560.383,1136,0.226812,38.5795,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1152
  counters:
    num_agent_steps_sampled: 1152
    num_agent_steps_trained: 1152
    num_env_steps_sampled: 1152
    num_env_steps_trained: 1152
  custom_metrics: {}
  date: 2022-07-05_15-34-53
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.40637179397578327
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 384
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3852202892303467
          entropy_coeff: 0.0
          kl: 1.302287500948296e-06
          model: {}
          policy_loss: 0.4026944041252136
          total_loss: 2.403886079788208
          vf_explained_var: 0.0037181635852903128
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,144,567.484,1152,-0.406372,38.1945,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,144,567.484,1152,-0.406372,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1160
  counters:
    num_agent_steps_sampled: 1160
    num_agent_steps_trained: 1160
    num_env_steps_sampled: 1160
    num_env_steps_trained: 1160
  custom_metrics: {}
  date: 2022-07-05_15-35-00
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.3843345646780393
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 386
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.0038735660907911473
    episode_reward_mean: -0.0038735660907911473
    episode_reward_min: -0.0038735660907911473
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.0038735660907911473
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_pr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,146,578.487,1168,-0.63974,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1176
  counters:
    num_agent_steps_sampled: 1176
    num_agent_steps_trained: 1176
    num_env_steps_sampled: 1176
    num_env_steps_trained: 1176
  custom_metrics: {}
  date: 2022-07-05_15-35-07
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.31794568978988624
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 392
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3850247859954834
          entropy_coeff: 0.0
          kl: 1.780995035005617e-06
          model: {}
          policy_loss: 0.5735819935798645
          total_loss: 0.577116072177887
          vf_explained_var: -1.0
          vf_loss: 0.003534040180

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,148,585.695,1184,-0.679855,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1192
  counters:
    num_agent_steps_sampled: 1192
    num_agent_steps_trained: 1192
    num_env_steps_sampled: 1192
    num_env_steps_trained: 1192
  custom_metrics: {}
  date: 2022-07-05_15-35-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.2676784975371451
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 396
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3839095830917358
          entropy_coeff: 0.0
          kl: 5.688213150278898e-06
          model: {}
          policy_loss: -0.42966222763061523
          total_loss: 3.8048243522644043
          vf_explained_var: 0.0011903762351721525
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,149,588.792,1192,-0.267678,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1200
  counters:
    num_agent_steps_sampled: 1200
    num_agent_steps_trained: 1200
    num_env_steps_sampled: 1200
    num_env_steps_trained: 1200
  custom_metrics: {}
  date: 2022-07-05_15-35-21
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.1539116879514922
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 400
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.1252721674282582
    episode_reward_mean: 0.1252721674282582
    episode_reward_min: 0.1252721674282582
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.1252721674282582
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,152,600.858,1216,0.39454,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1232
  counters:
    num_agent_steps_sampled: 1232
    num_agent_steps_trained: 1232
    num_env_steps_sampled: 1232
    num_env_steps_trained: 1232
  custom_metrics: {}
  date: 2022-07-05_15-35-32
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.12039640018785633
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 410
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3846625089645386
          entropy_coeff: 0.0
          kl: 4.545082902041031e-06
          model: {}
          policy_loss: 0.4288715422153473
          total_loss: 6.944626808166504
          vf_explained_var: -0.002152907894924283
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,154,606.212,1232,-0.120396,38.1945,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,154,606.212,1232,-0.120396,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1240
  counters:
    num_agent_steps_sampled: 1240
    num_agent_steps_trained: 1240
    num_env_steps_sampled: 1240
    num_env_steps_trained: 1240
  custom_metrics: {}
  date: 2022-07-05_15-35-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.31522601960687596
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 412
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.13801043082796127
    episode_reward_mean: -0.13801043082796127
    episode_reward_min: -0.13801043082796127
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.13801043082796127
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,157,617.419,1256,0.392232,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1272
  counters:
    num_agent_steps_sampled: 1272
    num_agent_steps_trained: 1272
    num_env_steps_sampled: 1272
    num_env_steps_trained: 1272
  custom_metrics: {}
  date: 2022-07-05_15-35-50
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: 0.011537924077588272
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 424
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3853760957717896
          entropy_coeff: 0.0
          kl: 3.817832748609362e-06
          model: {}
          policy_loss: -0.005770206451416016
          total_loss: 5.99476432800293
          vf_explained_var: -0.0007847627275623381
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,159,624.222,1272,0.0115379,38.1945,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,159,624.222,1272,0.0115379,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1280
  counters:
    num_agent_steps_sampled: 1280
    num_agent_steps_trained: 1280
    num_env_steps_sampled: 1280
    num_env_steps_trained: 1280
  custom_metrics: {}
  date: 2022-07-05_15-35-56
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.25631684129244914
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 426
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.08067512295721979
    episode_reward_mean: 0.08067512295721979
    episode_reward_min: 0.08067512295721979
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.08067512295721979
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,162,638.538,1296,-0.643121,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1312
  counters:
    num_agent_steps_sampled: 1312
    num_agent_steps_trained: 1312
    num_env_steps_sampled: 1312
    num_env_steps_trained: 1312
  custom_metrics: {}
  date: 2022-07-05_15-36-12
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.3255353094754638
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 436
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3820347785949707
          entropy_coeff: 0.0
          kl: 7.352091870416189e-06
          model: {}
          policy_loss: 0.3718799650669098
          total_loss: 1.4749813079833984
          vf_explained_var: -0.0017499704845249653
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,164,645.971,1312,-0.325535,38.1945,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,164,645.971,1312,-0.325535,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1320
  counters:
    num_agent_steps_sampled: 1320
    num_agent_steps_trained: 1320
    num_env_steps_sampled: 1320
    num_env_steps_trained: 1320
  custom_metrics: {}
  date: 2022-07-05_15-36-18
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.060334894676554986
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 4
  episodes_total: 440
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.09126683711399575
    episode_reward_mean: -0.09126683711399575
    episode_reward_min: -0.09126683711399575
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.09126683711399575
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processi

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,167,659.593,1336,-0.308,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1352
  counters:
    num_agent_steps_sampled: 1352
    num_agent_steps_trained: 1352
    num_env_steps_sampled: 1352
    num_env_steps_trained: 1352
  custom_metrics: {}
  date: 2022-07-05_15-36-34
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.5380618295063286
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 450
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3835406303405762
          entropy_coeff: 0.0
          kl: 5.055527253716718e-06
          model: {}
          policy_loss: -0.39685338735580444
          total_loss: -0.05189188942313194
          vf_explained_var: 0.01272120513021946
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,169,667.45,1352,-0.538062,38.1945,-39.3638,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,169,667.45,1352,-0.538062,38.1945,-39.3638,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1360
  counters:
    num_agent_steps_sampled: 1360
    num_agent_steps_trained: 1360
    num_env_steps_sampled: 1360
    num_env_steps_trained: 1360
  custom_metrics: {}
  date: 2022-07-05_15-36-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.19445079897323
  episode_reward_mean: -0.6571661017792867
  episode_reward_min: -39.36378726006254
  episodes_this_iter: 2
  episodes_total: 452
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.2554017398110475
    episode_reward_mean: -1.2554017398110475
    episode_reward_min: -1.2554017398110475
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.2554017398110475
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,171,678.139,1368,-0.666289,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1376
  counters:
    num_agent_steps_sampled: 1376
    num_agent_steps_trained: 1376
    num_env_steps_sampled: 1376
    num_env_steps_trained: 1376
  custom_metrics: {}
  date: 2022-07-05_15-36-49
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: -0.38391541014375163
  episode_reward_min: -37.96764451149938
  episodes_this_iter: 2
  episodes_total: 458
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.38205885887146
          entropy_coeff: 0.0
          kl: 2.8302952159720007e-06
          model: {}
          policy_loss: -0.335783451795578
          total_loss: -0.33186644315719604
          vf_explained_var: -0.2599194347858429
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,173,685.487,1384,-0.392288,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1392
  counters:
    num_agent_steps_sampled: 1392
    num_agent_steps_trained: 1392
    num_env_steps_sampled: 1392
    num_env_steps_trained: 1392
  custom_metrics: {}
  date: 2022-07-05_15-36-56
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: -0.0720649282930523
  episode_reward_min: -37.96764451149938
  episodes_this_iter: 4
  episodes_total: 464
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.382438063621521
          entropy_coeff: 0.0
          kl: 7.490955340472283e-06
          model: {}
          policy_loss: 0.026786400005221367
          total_loss: 0.06374058127403259
          vf_explained_var: 0.06911886483430862
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,174,689.211,1392,-0.0720649,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1400
  counters:
    num_agent_steps_sampled: 1400
    num_agent_steps_trained: 1400
    num_env_steps_sampled: 1400
    num_env_steps_trained: 1400
  custom_metrics: {}
  date: 2022-07-05_15-37-02
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: 0.28060929581784794
  episode_reward_min: -37.96764451149938
  episodes_this_iter: 2
  episodes_total: 466
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.2577233719074066
    episode_reward_mean: 1.2577233719074066
    episode_reward_min: 1.2577233719074066
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.2577233719074066
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,177,700.702,1416,-0.332431,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1432
  counters:
    num_agent_steps_sampled: 1432
    num_agent_steps_trained: 1432
    num_env_steps_sampled: 1432
    num_env_steps_trained: 1432
  custom_metrics: {}
  date: 2022-07-05_15-37-13
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: -0.10684243324711908
  episode_reward_min: -37.96764451149938
  episodes_this_iter: 2
  episodes_total: 476
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3859220743179321
          entropy_coeff: 0.0
          kl: 1.161091131507419e-05
          model: {}
          policy_loss: 0.33923524618148804
          total_loss: 4.5913004875183105
          vf_explained_var: -0.004134901333600283
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,179,705.977,1432,-0.106842,34.6828,-37.9676,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,179,705.977,1432,-0.106842,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1440
  counters:
    num_agent_steps_sampled: 1440
    num_agent_steps_trained: 1440
    num_env_steps_sampled: 1440
    num_env_steps_trained: 1440
  custom_metrics: {}
  date: 2022-07-05_15-37-18
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: 0.6069769393056239
  episode_reward_min: -37.96764451149938
  episodes_this_iter: 4
  episodes_total: 480
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.3336508891277516
    episode_reward_mean: -1.3336508891277516
    episode_reward_min: -1.3336508891277516
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.3336508891277516
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,182,716.714,1456,0.35198,34.6828,-37.9676,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1472
  counters:
    num_agent_steps_sampled: 1472
    num_agent_steps_trained: 1472
    num_env_steps_sampled: 1472
    num_env_steps_trained: 1472
  custom_metrics: {}
  date: 2022-07-05_15-37-31
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: -0.0002147669860108925
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 490
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3813174962997437
          entropy_coeff: 0.0
          kl: 1.2257104344826075e-06
          model: {}
          policy_loss: 0.37801113724708557
          total_loss: 0.3794020116329193
          vf_explained_var: -0.40156522393226624
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,184,723.454,1472,-0.000214767,34.6828,-39.8294,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,184,723.454,1472,-0.000214767,34.6828,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1480
  counters:
    num_agent_steps_sampled: 1480
    num_agent_steps_trained: 1480
    num_env_steps_sampled: 1480
    num_env_steps_trained: 1480
  custom_metrics: {}
  date: 2022-07-05_15-37-37
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.68275836071352
  episode_reward_mean: -0.10639271637728771
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 492
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.3593500809931505
    episode_reward_mean: 1.3593500809931505
    episode_reward_min: 1.3593500809931505
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.3593500809931505
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,187,735.631,1496,-0.346712,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1512
  counters:
    num_agent_steps_sampled: 1512
    num_agent_steps_trained: 1512
    num_env_steps_sampled: 1512
    num_env_steps_trained: 1512
  custom_metrics: {}
  date: 2022-07-05_15-37-51
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.39407781121944857
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 504
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.381507396697998
          entropy_coeff: 0.0
          kl: 4.118684501008829e-06
          model: {}
          policy_loss: -0.24340540170669556
          total_loss: -0.24226322770118713
          vf_explained_var: 0.1548745185136795
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,189,743.38,1512,-0.394078,34.0452,-39.8294,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,189,743.38,1512,-0.394078,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1520
  counters:
    num_agent_steps_sampled: 1520
    num_agent_steps_trained: 1520
    num_env_steps_sampled: 1520
    num_env_steps_trained: 1520
  custom_metrics: {}
  date: 2022-07-05_15-37-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.6706902897443893
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 506
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.03726657519340426
    episode_reward_mean: 0.03726657519340426
    episode_reward_min: 0.03726657519340426
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.03726657519340426
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,191,754.151,1528,-0.553326,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1536
  counters:
    num_agent_steps_sampled: 1536
    num_agent_steps_trained: 1536
    num_env_steps_sampled: 1536
    num_env_steps_trained: 1536
  custom_metrics: {}
  date: 2022-07-05_15-38-06
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.2985314800829438
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 512
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3849071264266968
          entropy_coeff: 0.0
          kl: 1.6640949979773723e-05
          model: {}
          policy_loss: 0.2470901757478714
          total_loss: 0.6101271510124207
          vf_explained_var: -0.00797365140169859
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,193,762.626,1544,-0.241646,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1552
  counters:
    num_agent_steps_sampled: 1552
    num_agent_steps_trained: 1552
    num_env_steps_sampled: 1552
    num_env_steps_trained: 1552
  custom_metrics: {}
  date: 2022-07-05_15-38-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.5819104289499364
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 516
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3856453895568848
          entropy_coeff: 0.0
          kl: 2.026987658609869e-06
          model: {}
          policy_loss: 0.3716532289981842
          total_loss: 0.3722209334373474
          vf_explained_var: -0.7954835891723633
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,194,766.461,1552,-0.58191,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1560
  counters:
    num_agent_steps_sampled: 1560
    num_agent_steps_trained: 1560
    num_env_steps_sampled: 1560
    num_env_steps_trained: 1560
  custom_metrics: {}
  date: 2022-07-05_15-38-20
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.013282811135988597
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 520
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.008024736862237258
    episode_reward_mean: -0.008024736862237258
    episode_reward_min: -0.008024736862237258
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.008024736862237258
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_pro

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,196,776.554,1568,-0.0256683,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1576
  counters:
    num_agent_steps_sampled: 1576
    num_agent_steps_trained: 1576
    num_env_steps_sampled: 1576
    num_env_steps_trained: 1576
  custom_metrics: {}
  date: 2022-07-05_15-38-28
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.045182605427826
  episode_reward_mean: -0.02523136253444823
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 524
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.382055401802063
          entropy_coeff: 0.0
          kl: 1.087048713088734e-05
          model: {}
          policy_loss: -0.26785457134246826
          total_loss: -0.26683560013771057
          vf_explained_var: -0.6305055022239685
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,198,784.277,1584,-0.0214841,34.0452,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1592
  counters:
    num_agent_steps_sampled: 1592
    num_agent_steps_trained: 1592
    num_env_steps_sampled: 1592
    num_env_steps_trained: 1592
  custom_metrics: {}
  date: 2022-07-05_15-38-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.6475719111040273
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 530
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.381934404373169
          entropy_coeff: 0.0
          kl: 6.583081244571076e-07
          model: {}
          policy_loss: -0.026127682998776436
          total_loss: 1.312961459159851
          vf_explained_var: -0.02106560952961445
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,199,787.077,1592,0.647572,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1600
  counters:
    num_agent_steps_sampled: 1600
    num_agent_steps_trained: 1600
    num_env_steps_sampled: 1600
    num_env_steps_trained: 1600
  custom_metrics: {}
  date: 2022-07-05_15-38-41
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.61930676827058
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 532
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.0927941305578206
    episode_reward_mean: 0.0927941305578206
    episode_reward_min: 0.0927941305578206
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.0927941305578206
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1440

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,202,799.049,1616,-0.287479,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1632
  counters:
    num_agent_steps_sampled: 1632
    num_agent_steps_trained: 1632
    num_env_steps_sampled: 1632
    num_env_steps_trained: 1632
  custom_metrics: {}
  date: 2022-07-05_15-38-54
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: -0.013113904036337967
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 544
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3799270391464233
          entropy_coeff: 0.0
          kl: 1.4024326446815394e-05
          model: {}
          policy_loss: -0.3939644992351532
          total_loss: -0.04511724412441254
          vf_explained_var: 0.05712488666176796
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,204,805.77,1632,-0.0131139,35.4775,-39.8294,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,204,805.77,1632,-0.0131139,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1640
  counters:
    num_agent_steps_sampled: 1640
    num_agent_steps_trained: 1640
    num_env_steps_sampled: 1640
    num_env_steps_trained: 1640
  custom_metrics: {}
  date: 2022-07-05_15-39-00
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.0006943099003707564
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 546
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.061612636509530194
    episode_reward_mean: -0.061612636509530194
    episode_reward_min: -0.061612636509530194
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.061612636509530194
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,206,816.276,1648,-0.0126191,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1656
  counters:
    num_agent_steps_sampled: 1656
    num_agent_steps_trained: 1656
    num_env_steps_sampled: 1656
    num_env_steps_trained: 1656
  custom_metrics: {}
  date: 2022-07-05_15-39-08
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.27366804199514505
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 552
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.377416729927063
          entropy_coeff: 0.0
          kl: 5.06258693349082e-05
          model: {}
          policy_loss: -0.3759719431400299
          total_loss: 1.6271474361419678
          vf_explained_var: 0.0013741691363975406
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,208,824.375,1664,-0.0122948,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1672
  counters:
    num_agent_steps_sampled: 1672
    num_agent_steps_trained: 1672
    num_env_steps_sampled: 1672
    num_env_steps_trained: 1672
  custom_metrics: {}
  date: 2022-07-05_15-39-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.0006406392346945389
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 556
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3812634944915771
          entropy_coeff: 0.0
          kl: 2.0073997802683152e-05
          model: {}
          policy_loss: -0.017071697860956192
          total_loss: 0.6145417094230652
          vf_explained_var: -0.025531355291604996
          

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,209,828.174,1672,0.000640639,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1680
  counters:
    num_agent_steps_sampled: 1680
    num_agent_steps_trained: 1680
    num_env_steps_sampled: 1680
    num_env_steps_trained: 1680
  custom_metrics: {}
  date: 2022-07-05_15-39-23
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.27805800955172094
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 560
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.441339278033468
    episode_reward_mean: -1.441339278033468
    episode_reward_min: -1.441339278033468
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.441339278033468
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,211,838.83,1688,0.0305719,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1696
  counters:
    num_agent_steps_sampled: 1696
    num_agent_steps_trained: 1696
    num_env_steps_sampled: 1696
    num_env_steps_trained: 1696
  custom_metrics: {}
  date: 2022-07-05_15-39-30
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: -0.014087023141006598
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 564
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3825500011444092
          entropy_coeff: 0.0
          kl: 3.803010940828244e-06
          model: {}
          policy_loss: -0.287329763174057
          total_loss: 5.48217248916626
          vf_explained_var: -0.0013874828582629561
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,213,844.798,1704,0.573841,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1712
  counters:
    num_agent_steps_sampled: 1712
    num_agent_steps_trained: 1712
    num_env_steps_sampled: 1712
    num_env_steps_trained: 1712
  custom_metrics: {}
  date: 2022-07-05_15-39-36
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: 0.0024545825414762136
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 570
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3764458894729614
          entropy_coeff: 0.0
          kl: 3.806267386607942e-06
          model: {}
          policy_loss: -0.5109497308731079
          total_loss: 1.5952471494674683
          vf_explained_var: -0.04556975141167641
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,214,847.692,1712,0.00245458,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1720
  counters:
    num_agent_steps_sampled: 1720
    num_agent_steps_trained: 1720
    num_env_steps_sampled: 1720
    num_env_steps_trained: 1720
  custom_metrics: {}
  date: 2022-07-05_15-39-43
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: -0.06178242029314687
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 572
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.4443072371752428
    episode_reward_mean: 1.4443072371752428
    episode_reward_min: 1.4443072371752428
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.4443072371752428
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,217,860.898,1736,-0.256576,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1752
  counters:
    num_agent_steps_sampled: 1752
    num_agent_steps_trained: 1752
    num_env_steps_sampled: 1752
    num_env_steps_trained: 1752
  custom_metrics: {}
  date: 2022-07-05_15-39-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: -0.3303494552935811
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 4
  episodes_total: 584
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.380912184715271
          entropy_coeff: 0.0
          kl: 1.0930610187642742e-05
          model: {}
          policy_loss: -0.003243439830839634
          total_loss: 3.9968698024749756
          vf_explained_var: 0.000850677490234375
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,219,868.478,1752,-0.330349,35.4775,-39.8294,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,219,868.478,1752,-0.330349,35.4775,-39.8294,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1760
  counters:
    num_agent_steps_sampled: 1760
    num_agent_steps_trained: 1760
    num_env_steps_sampled: 1760
    num_env_steps_trained: 1760
  custom_metrics: {}
  date: 2022-07-05_15-40-03
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.47754320835136
  episode_reward_mean: -0.14217387800817488
  episode_reward_min: -39.82938783525944
  episodes_this_iter: 2
  episodes_total: 586
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.3350275696487388
    episode_reward_mean: -1.3350275696487388
    episode_reward_min: -1.3350275696487388
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.3350275696487388
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,222,880.279,1776,-0.00927508,35.4775,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1792
  counters:
    num_agent_steps_sampled: 1792
    num_agent_steps_trained: 1792
    num_env_steps_sampled: 1792
    num_env_steps_trained: 1792
  custom_metrics: {}
  date: 2022-07-05_15-40-15
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.4137870467136473
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 2
  episodes_total: 596
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3726792335510254
          entropy_coeff: 0.0
          kl: 6.52436756354291e-06
          model: {}
          policy_loss: -0.48741334676742554
          total_loss: 3.5551540851593018
          vf_explained_var: 0.0023825527168810368
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,224,885.813,1792,0.413787,39.5136,-34.373,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,224,885.813,1792,0.413787,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1800
  counters:
    num_agent_steps_sampled: 1800
    num_agent_steps_trained: 1800
    num_env_steps_sampled: 1800
    num_env_steps_trained: 1800
  custom_metrics: {}
  date: 2022-07-05_15-40-20
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.6285659443428805
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 4
  episodes_total: 600
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.12051406057169722
    episode_reward_mean: -0.12051406057169722
    episode_reward_min: -0.12051406057169722
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.12051406057169722
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,227,895.795,1816,0.299064,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1824
  counters:
    num_agent_steps_sampled: 1824
    num_agent_steps_trained: 1824
    num_env_steps_sampled: 1824
    num_env_steps_trained: 1824
  custom_metrics: {}
  date: 2022-07-05_15-40-29
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.014320922032052716
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 4
  episodes_total: 608
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.379868745803833
          entropy_coeff: 0.0
          kl: 8.753112524573226e-06
          model: {}
          policy_loss: 0.22206343710422516
          total_loss: 2.222308397293091
          vf_explained_var: -0.0017144441371783614
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,229,902.543,1832,0.646147,39.5136,-34.373,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,229,902.543,1832,0.646147,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1840
  counters:
    num_agent_steps_sampled: 1840
    num_agent_steps_trained: 1840
    num_env_steps_sampled: 1840
    num_env_steps_trained: 1840
  custom_metrics: {}
  date: 2022-07-05_15-40-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.01425847354261954
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 2
  episodes_total: 612
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.006168259268979126
    episode_reward_mean: 0.006168259268979126
    episode_reward_min: 0.006168259268979126
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.006168259268979126
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,231,912.676,1848,0.30357,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1856
  counters:
    num_agent_steps_sampled: 1856
    num_agent_steps_trained: 1856
    num_env_steps_sampled: 1856
    num_env_steps_trained: 1856
  custom_metrics: {}
  date: 2022-07-05_15-40-46
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.29132915085342803
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 2
  episodes_total: 618
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3807545900344849
          entropy_coeff: 0.0
          kl: 1.4007255231263116e-06
          model: {}
          policy_loss: -0.16882209479808807
          total_loss: 0.8712654113769531
          vf_explained_var: 0.020723531022667885
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,233,919.564,1864,0.239915,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1872
  counters:
    num_agent_steps_sampled: 1872
    num_agent_steps_trained: 1872
    num_env_steps_sampled: 1872
    num_env_steps_trained: 1872
  custom_metrics: {}
  date: 2022-07-05_15-40-53
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.026129058597161456
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 4
  episodes_total: 624
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3854104280471802
          entropy_coeff: 0.0
          kl: 1.9339263417350594e-06
          model: {}
          policy_loss: 0.22531892359256744
          total_loss: 2.2268826961517334
          vf_explained_var: -0.00031089383992366493
          

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,234,923.427,1872,0.0261291,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1880
  counters:
    num_agent_steps_sampled: 1880
    num_agent_steps_trained: 1880
    num_env_steps_sampled: 1880
    num_env_steps_trained: 1880
  custom_metrics: {}
  date: 2022-07-05_15-40-58
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.0330806525141997
  episode_reward_min: -34.373019185827154
  episodes_this_iter: 2
  episodes_total: 626
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.007635691293326863
    episode_reward_mean: -0.007635691293326863
    episode_reward_min: -0.007635691293326863
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.007635691293326863
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proces

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,237,935.093,1896,-0.298504,39.5136,-34.373,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1912
  counters:
    num_agent_steps_sampled: 1912
    num_agent_steps_trained: 1912
    num_env_steps_sampled: 1912
    num_env_steps_trained: 1912
  custom_metrics: {}
  date: 2022-07-05_15-41-12
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.25162236560275686
  episode_reward_min: -37.57764111347744
  episodes_this_iter: 2
  episodes_total: 636
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3789061307907104
          entropy_coeff: 0.0
          kl: 4.9258393119089305e-05
          model: {}
          policy_loss: 0.2166845202445984
          total_loss: 2.218311071395874
          vf_explained_var: -0.00039915242814458907
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,239,942.16,1912,-0.251622,39.5136,-37.5776,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,239,942.16,1912,-0.251622,39.5136,-37.5776,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1920
  counters:
    num_agent_steps_sampled: 1920
    num_agent_steps_trained: 1920
    num_env_steps_sampled: 1920
    num_env_steps_trained: 1920
  custom_metrics: {}
  date: 2022-07-05_15-41-18
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.30097187981880535
  episode_reward_min: -37.57764111347744
  episodes_this_iter: 4
  episodes_total: 640
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.07867072788649698
    episode_reward_mean: 0.07867072788649698
    episode_reward_min: 0.07867072788649698
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.07867072788649698
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,242,955.332,1936,0.400174,39.5136,-37.5776,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1952
  counters:
    num_agent_steps_sampled: 1952
    num_agent_steps_trained: 1952
    num_env_steps_sampled: 1952
    num_env_steps_trained: 1952
  custom_metrics: {}
  date: 2022-07-05_15-41-32
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.02585272673736875
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 650
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3837971687316895
          entropy_coeff: 0.0
          kl: 3.4715167203103192e-06
          model: {}
          policy_loss: 0.006093600764870644
          total_loss: 0.6909956336021423
          vf_explained_var: 0.0029791593551635742
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,244,962.424,1952,0.0258527,39.5136,-37.8607,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,244,962.424,1952,0.0258527,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1960
  counters:
    num_agent_steps_sampled: 1960
    num_agent_steps_trained: 1960
    num_env_steps_sampled: 1960
    num_env_steps_trained: 1960
  custom_metrics: {}
  date: 2022-07-05_15-41-38
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.24633499087692118
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 652
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.05122246524477703
    episode_reward_mean: 0.05122246524477703
    episode_reward_min: 0.05122246524477703
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.05122246524477703
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,247,975.568,1976,0.0338072,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 1992
  counters:
    num_agent_steps_sampled: 1992
    num_agent_steps_trained: 1992
    num_env_steps_sampled: 1992
    num_env_steps_trained: 1992
  custom_metrics: {}
  date: 2022-07-05_15-41-53
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.24743294098204544
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 4
  episodes_total: 664
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3820239305496216
          entropy_coeff: 0.0
          kl: 2.2890740183356684e-06
          model: {}
          policy_loss: -0.5310252904891968
          total_loss: 1.4703408479690552
          vf_explained_var: -8.618831634521484e-05
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,249,983.38,1992,-0.247433,39.5136,-37.8607,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,249,983.38,1992,-0.247433,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2000
  counters:
    num_agent_steps_sampled: 2000
    num_agent_steps_trained: 2000
    num_env_steps_sampled: 2000
    num_env_steps_trained: 2000
  custom_metrics: {}
  date: 2022-07-05_15-41-59
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.21777695670783956
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 666
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.011326670361100355
    episode_reward_mean: -0.011326670361100355
    episode_reward_min: -0.011326670361100355
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.011326670361100355
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proce

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,252,996.623,2016,-0.217905,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2032
  counters:
    num_agent_steps_sampled: 2032
    num_agent_steps_trained: 2032
    num_env_steps_sampled: 2032
    num_env_steps_trained: 2032
  custom_metrics: {}
  date: 2022-07-05_15-42-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.004438960801534759
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 676
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3833101987838745
          entropy_coeff: 0.0
          kl: 1.4591651961382013e-05
          model: {}
          policy_loss: -0.10728109627962112
          total_loss: 4.4417405128479
          vf_explained_var: 0.000427289807703346
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,254,1004.01,2032,-0.00443896,39.5136,-37.8607,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,254,1004.01,2032,-0.00443896,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2040
  counters:
    num_agent_steps_sampled: 2040
    num_agent_steps_trained: 2040
    num_env_steps_sampled: 2040
    num_env_steps_trained: 2040
  custom_metrics: {}
  date: 2022-07-05_15-42-20
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: -0.04711438838188576
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 4
  episodes_total: 680
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.07572814757499957
    episode_reward_mean: -0.07572814757499957
    episode_reward_min: -0.07572814757499957
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.07572814757499957
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,257,1016.87,2056,-0.292087,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2072
  counters:
    num_agent_steps_sampled: 2072
    num_agent_steps_trained: 2072
    num_env_steps_sampled: 2072
    num_env_steps_trained: 2072
  custom_metrics: {}
  date: 2022-07-05_15-42-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.4497330066186467
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 690
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3778153657913208
          entropy_coeff: 0.0
          kl: 5.7814227147900965e-06
          model: {}
          policy_loss: 0.17002536356449127
          total_loss: 2.171694040298462
          vf_explained_var: -0.0010776002891361713
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,259,1024.1,2072,0.449733,39.5136,-37.8607,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,259,1024.1,2072,0.449733,39.5136,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2080
  counters:
    num_agent_steps_sampled: 2080
    num_agent_steps_trained: 2080
    num_env_steps_sampled: 2080
    num_env_steps_trained: 2080
  custom_metrics: {}
  date: 2022-07-05_15-42-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.51356837488812
  episode_reward_mean: 0.4995845049465108
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 692
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.049920499213259806
    episode_reward_mean: 0.049920499213259806
    episode_reward_min: 0.049920499213259806
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.049920499213259806
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,262,1036.11,2096,-0.670141,37.9987,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2112
  counters:
    num_agent_steps_sampled: 2112
    num_agent_steps_trained: 2112
    num_env_steps_sampled: 2112
    num_env_steps_trained: 2112
  custom_metrics: {}
  date: 2022-07-05_15-42-54
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.99874232229823
  episode_reward_mean: 0.08745504224046922
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 4
  episodes_total: 704
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3802071809768677
          entropy_coeff: 0.0
          kl: 3.6929147881892277e-06
          model: {}
          policy_loss: 0.5713880062103271
          total_loss: 0.574289083480835
          vf_explained_var: 0.11565154045820236
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,264,1043.27,2112,0.087455,37.9987,-37.8607,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,264,1043.27,2112,0.087455,37.9987,-37.8607,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2120
  counters:
    num_agent_steps_sampled: 2120
    num_agent_steps_trained: 2120
    num_env_steps_sampled: 2120
    num_env_steps_trained: 2120
  custom_metrics: {}
  date: 2022-07-05_15-43-00
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.99874232229823
  episode_reward_mean: -0.00019217390157542978
  episode_reward_min: -37.86066125426519
  episodes_this_iter: 2
  episodes_total: 706
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.012003828426398133
    episode_reward_mean: -0.012003828426398133
    episode_reward_min: -0.012003828426398133
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.012003828426398133
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_pr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,267,1055.65,2136,0.0127253,37.9987,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2152
  counters:
    num_agent_steps_sampled: 2152
    num_agent_steps_trained: 2152
    num_env_steps_sampled: 2152
    num_env_steps_trained: 2152
  custom_metrics: {}
  date: 2022-07-05_15-43-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.99874232229823
  episode_reward_mean: -0.2906886987926691
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 716
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.376981258392334
          entropy_coeff: 0.0
          kl: 1.8002643855652423e-06
          model: {}
          policy_loss: 0.42506811022758484
          total_loss: 2.425208330154419
          vf_explained_var: 0.00017761786875780672
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,269,1062.67,2152,-0.290689,37.9987,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,269,1062.67,2152,-0.290689,37.9987,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2160
  counters:
    num_agent_steps_sampled: 2160
    num_agent_steps_trained: 2160
    num_env_steps_sampled: 2160
    num_env_steps_trained: 2160
  custom_metrics: {}
  date: 2022-07-05_15-43-19
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.99874232229823
  episode_reward_mean: 0.13627543617448992
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 4
  episodes_total: 720
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.07292342114468908
    episode_reward_mean: -0.07292342114468908
    episode_reward_min: -0.07292342114468908
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.07292342114468908
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,272,1075.84,2176,0.322422,37.9987,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2192
  counters:
    num_agent_steps_sampled: 2192
    num_agent_steps_trained: 2192
    num_env_steps_sampled: 2192
    num_env_steps_trained: 2192
  custom_metrics: {}
  date: 2022-07-05_15-43-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.99874232229823
  episode_reward_mean: -0.014296681626003931
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 730
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3819829225540161
          entropy_coeff: 0.0
          kl: 2.1749354345956817e-05
          model: {}
          policy_loss: -0.2157408893108368
          total_loss: 0.13618721067905426
          vf_explained_var: -0.003928490448743105
          v

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,274,1083.49,2192,-0.0142967,37.9987,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,274,1083.49,2192,-0.0142967,37.9987,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2200
  counters:
    num_agent_steps_sampled: 2200
    num_agent_steps_trained: 2200
    num_env_steps_sampled: 2200
    num_env_steps_trained: 2200
  custom_metrics: {}
  date: 2022-07-05_15-43-41
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.73681409490054
  episode_reward_mean: -0.3347514908791674
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 732
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.013629178893729277
    episode_reward_mean: 0.013629178893729277
    episode_reward_min: 0.013629178893729277
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.013629178893729277
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,277,1095.95,2216,0.395624,37.7368,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2232
  counters:
    num_agent_steps_sampled: 2232
    num_agent_steps_trained: 2232
    num_env_steps_sampled: 2232
    num_env_steps_trained: 2232
  custom_metrics: {}
  date: 2022-07-05_15-43-54
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.73681409490054
  episode_reward_mean: -0.05386871723129239
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 4
  episodes_total: 744
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3766199350357056
          entropy_coeff: 0.0
          kl: 7.840334546926897e-06
          model: {}
          policy_loss: 0.16348201036453247
          total_loss: 2.163583517074585
          vf_explained_var: 0.001037164474837482
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,279,1102.21,2232,-0.0538687,37.7368,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,279,1102.21,2232,-0.0538687,37.7368,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2240
  counters:
    num_agent_steps_sampled: 2240
    num_agent_steps_trained: 2240
    num_env_steps_sampled: 2240
    num_env_steps_trained: 2240
  custom_metrics: {}
  date: 2022-07-05_15-44-00
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.73681409490054
  episode_reward_mean: -0.10198389331884221
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 746
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.062244990486512775
    episode_reward_mean: 0.062244990486512775
    episode_reward_min: 0.062244990486512775
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.062244990486512775
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,282,1114.13,2256,0.371033,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2272
  counters:
    num_agent_steps_sampled: 2272
    num_agent_steps_trained: 2272
    num_env_steps_sampled: 2272
    num_env_steps_trained: 2272
  custom_metrics: {}
  date: 2022-07-05_15-44-12
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.3283007955685096
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 756
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.383729338645935
          entropy_coeff: 0.0
          kl: 2.2426387658924796e-05
          model: {}
          policy_loss: 0.3128996789455414
          total_loss: 8.31301498413086
          vf_explained_var: -0.001788834691978991
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,284,1120.21,2272,0.328301,37.9517,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,284,1120.21,2272,0.328301,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2280
  counters:
    num_agent_steps_sampled: 2280
    num_agent_steps_trained: 2280
    num_env_steps_sampled: 2280
    num_env_steps_trained: 2280
  custom_metrics: {}
  date: 2022-07-05_15-44-18
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.3424970260005331
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 4
  episodes_total: 760
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 1.3358627072606484
    episode_reward_mean: 1.3358627072606484
    episode_reward_min: 1.3358627072606484
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 1.3358627072606484
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,287,1134.15,2296,-0.0134244,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2312
  counters:
    num_agent_steps_sampled: 2312
    num_agent_steps_trained: 2312
    num_env_steps_sampled: 2312
    num_env_steps_trained: 2312
  custom_metrics: {}
  date: 2022-07-05_15-44-33
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.3265053878209475
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 770
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3735864162445068
          entropy_coeff: 0.0
          kl: 2.8259551072551403e-06
          model: {}
          policy_loss: -0.363252192735672
          total_loss: 0.16787239909172058
          vf_explained_var: 0.039276860654354095
          vf_lo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,289,1141.57,2312,0.326505,37.9517,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,289,1141.57,2312,0.326505,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2320
  counters:
    num_agent_steps_sampled: 2320
    num_agent_steps_trained: 2320
    num_env_steps_sampled: 2320
    num_env_steps_trained: 2320
  custom_metrics: {}
  date: 2022-07-05_15-44-40
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.012368118202705532
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 772
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.01993644116260107
    episode_reward_mean: -0.01993644116260107
    episode_reward_min: -0.01993644116260107
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.01993644116260107
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_process

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,291,1152.04,2328,-0.30124,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2336
  counters:
    num_agent_steps_sampled: 2336
    num_agent_steps_trained: 2336
    num_env_steps_sampled: 2336
    num_env_steps_trained: 2336
  custom_metrics: {}
  date: 2022-07-05_15-44-47
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.2239637223613501
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 778
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3758131265640259
          entropy_coeff: 0.0
          kl: 1.749909824866336e-05
          model: {}
          policy_loss: -0.034036748111248016
          total_loss: 0.5423896312713623
          vf_explained_var: 0.027281025424599648
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,293,1159.34,2344,-0.211103,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2352
  counters:
    num_agent_steps_sampled: 2352
    num_agent_steps_trained: 2352
    num_env_steps_sampled: 2352
    num_env_steps_trained: 2352
  custom_metrics: {}
  date: 2022-07-05_15-44-55
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.3244715863993328
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 4
  episodes_total: 784
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3791905641555786
          entropy_coeff: 0.0
          kl: 7.89565183367813e-06
          model: {}
          policy_loss: -0.23393514752388
          total_loss: 3.046372890472412
          vf_explained_var: -0.0001299142895732075
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,294,1163.37,2352,0.324472,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2360
  counters:
    num_agent_steps_sampled: 2360
    num_agent_steps_trained: 2360
    num_env_steps_sampled: 2360
    num_env_steps_trained: 2360
  custom_metrics: {}
  date: 2022-07-05_15-45-02
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.0006075006053513121
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 786
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -1.3231911901968751
    episode_reward_mean: -1.3231911901968751
    episode_reward_min: -1.3231911901968751
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -1.3231911901968751
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,297,1177.09,2376,-0.390915,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2392
  counters:
    num_agent_steps_sampled: 2392
    num_agent_steps_trained: 2392
    num_env_steps_sampled: 2392
    num_env_steps_trained: 2392
  custom_metrics: {}
  date: 2022-07-05_15-45-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.020230075849545944
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 796
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3651820421218872
          entropy_coeff: 0.0
          kl: 4.259370325598866e-05
          model: {}
          policy_loss: 0.24429260194301605
          total_loss: 2.85448956489563
          vf_explained_var: -0.0021964549086987972
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,299,1183.82,2392,0.0202301,37.9517,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,299,1183.82,2392,0.0202301,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2400
  counters:
    num_agent_steps_sampled: 2400
    num_agent_steps_trained: 2400
    num_env_steps_sampled: 2400
    num_env_steps_trained: 2400
  custom_metrics: {}
  date: 2022-07-05_15-45-22
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.012464612674121595
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 4
  episodes_total: 800
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.05067898072733712
    episode_reward_mean: 0.05067898072733712
    episode_reward_min: 0.05067898072733712
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.05067898072733712
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,302,1196.56,2416,0.00239642,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2432
  counters:
    num_agent_steps_sampled: 2432
    num_agent_steps_trained: 2432
    num_env_steps_sampled: 2432
    num_env_steps_trained: 2432
  custom_metrics: {}
  date: 2022-07-05_15-45-36
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.0743738803464005
  episode_reward_min: -37.87087980744289
  episodes_this_iter: 2
  episodes_total: 810
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3760864734649658
          entropy_coeff: 0.0
          kl: 7.272712423400662e-07
          model: {}
          policy_loss: -0.3500292897224426
          total_loss: -0.33961138129234314
          vf_explained_var: -0.42713743448257446
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,304,1203.81,2432,0.0743739,37.9517,-37.8709,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,304,1203.81,2432,0.0743739,37.9517,-37.8709,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2440
  counters:
    num_agent_steps_sampled: 2440
    num_agent_steps_trained: 2440
    num_env_steps_sampled: 2440
    num_env_steps_trained: 2440
  custom_metrics: {}
  date: 2022-07-05_15-45-42
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.3733474764040077
  episode_reward_min: -37.54095571297353
  episodes_this_iter: 2
  episodes_total: 812
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.004272588470566552
    episode_reward_mean: -0.004272588470566552
    episode_reward_min: -0.004272588470566552
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.004272588470566552
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_proces

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,307,1214.72,2456,0.682238,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2472
  counters:
    num_agent_steps_sampled: 2472
    num_agent_steps_trained: 2472
    num_env_steps_sampled: 2472
    num_env_steps_trained: 2472
  custom_metrics: {}
  date: 2022-07-05_15-45-53
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.027275562305967557
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 4
  episodes_total: 824
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3685966730117798
          entropy_coeff: 0.0
          kl: 9.553452400723472e-06
          model: {}
          policy_loss: -0.13002605736255646
          total_loss: 5.870068073272705
          vf_explained_var: -0.0023862600792199373
          

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,309,1220.55,2472,-0.0272756,37.9517,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,309,1220.55,2472,-0.0272756,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2480
  counters:
    num_agent_steps_sampled: 2480
    num_agent_steps_trained: 2480
    num_env_steps_sampled: 2480
    num_env_steps_trained: 2480
  custom_metrics: {}
  date: 2022-07-05_15-45-59
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.013744609916330985
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 826
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.0233698022732709
    episode_reward_mean: 0.0233698022732709
    episode_reward_min: 0.0233698022732709
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.0233698022732709
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,312,1232.24,2496,0.681885,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2512
  counters:
    num_agent_steps_sampled: 2512
    num_agent_steps_trained: 2512
    num_env_steps_sampled: 2512
    num_env_steps_trained: 2512
  custom_metrics: {}
  date: 2022-07-05_15-46-12
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.33823282601868526
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 836
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.373751163482666
          entropy_coeff: 0.0
          kl: 7.275049574673176e-05
          model: {}
          policy_loss: -0.3118062913417816
          total_loss: -0.31077632308006287
          vf_explained_var: -0.05623684450984001
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,314,1239.54,2512,-0.338233,37.9517,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,314,1239.54,2512,-0.338233,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2520
  counters:
    num_agent_steps_sampled: 2520
    num_agent_steps_trained: 2520
    num_env_steps_sampled: 2520
    num_env_steps_trained: 2520
  custom_metrics: {}
  date: 2022-07-05_15-46-18
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: -0.0010932979409148124
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 4
  episodes_total: 840
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 36.45620516916407
    episode_reward_mean: 36.45620516916407
    episode_reward_min: 36.45620516916407
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 36.45620516916407
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,317,1252.32,2536,-0.12715,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2552
  counters:
    num_agent_steps_sampled: 2552
    num_agent_steps_trained: 2552
    num_env_steps_sampled: 2552
    num_env_steps_trained: 2552
  custom_metrics: {}
  date: 2022-07-05_15-46-32
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.951700149884786
  episode_reward_mean: 0.2544532240796423
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 850
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3766499757766724
          entropy_coeff: 0.0
          kl: 5.6944213611132e-06
          model: {}
          policy_loss: 0.376300185918808
          total_loss: 0.4074722230434418
          vf_explained_var: 0.1295095831155777
          vf_loss: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,319,1258.87,2552,0.254453,37.9517,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,319,1258.87,2552,0.254453,37.9517,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2560
  counters:
    num_agent_steps_sampled: 2560
    num_agent_steps_trained: 2560
    num_env_steps_sampled: 2560
    num_env_steps_trained: 2560
  custom_metrics: {}
  date: 2022-07-05_15-46-37
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.03106692828413
  episode_reward_mean: -0.06002280631307391
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 852
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.8127308330997636
    episode_reward_mean: -0.8127308330997636
    episode_reward_min: -0.8127308330997636
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.8127308330997636
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,322,1271.09,2576,-0.352851,35.0311,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2592
  counters:
    num_agent_steps_sampled: 2592
    num_agent_steps_trained: 2592
    num_env_steps_sampled: 2592
    num_env_steps_trained: 2592
  custom_metrics: {}
  date: 2022-07-05_15-46-51
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.03106692828413
  episode_reward_mean: 0.013065003650521132
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 4
  episodes_total: 864
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.376616358757019
          entropy_coeff: 0.0
          kl: 5.560476438404294e-06
          model: {}
          policy_loss: 0.5292600393295288
          total_loss: 0.9012609124183655
          vf_explained_var: -0.045191440731287
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,324,1277.89,2592,0.013065,35.0311,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,324,1277.89,2592,0.013065,35.0311,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2600
  counters:
    num_agent_steps_sampled: 2600
    num_agent_steps_trained: 2600
    num_env_steps_sampled: 2600
    num_env_steps_trained: 2600
  custom_metrics: {}
  date: 2022-07-05_15-46-57
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 35.03106692828413
  episode_reward_mean: -0.012486179091601395
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 866
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.5150395596983728
    episode_reward_mean: 0.5150395596983728
    episode_reward_min: 0.5150395596983728
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.5150395596983728
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,327,1290.24,2616,0.0276656,35.0311,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2632
  counters:
    num_agent_steps_sampled: 2632
    num_agent_steps_trained: 2632
    num_env_steps_sampled: 2632
    num_env_steps_trained: 2632
  custom_metrics: {}
  date: 2022-07-05_15-47-10
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: 0.4077076149598098
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 876
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3809226751327515
          entropy_coeff: 0.0
          kl: 7.105670192686375e-06
          model: {}
          policy_loss: -0.1880999058485031
          total_loss: 2.430574655532837
          vf_explained_var: -0.0028036197181791067
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,329,1297,2632,0.407708,37.9678,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,329,1297,2632,0.407708,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2640
  counters:
    num_agent_steps_sampled: 2640
    num_agent_steps_trained: 2640
    num_env_steps_sampled: 2640
    num_env_steps_trained: 2640
  custom_metrics: {}
  date: 2022-07-05_15-47-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: 0.386202967764229
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 4
  episodes_total: 880
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -36.204312325504254
    episode_reward_mean: -36.204312325504254
    episode_reward_min: -36.204312325504254
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -36.204312325504254
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,332,1309.04,2656,-0.322672,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2672
  counters:
    num_agent_steps_sampled: 2672
    num_agent_steps_trained: 2672
    num_env_steps_sampled: 2672
    num_env_steps_trained: 2672
  custom_metrics: {}
  date: 2022-07-05_15-47-30
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: -0.3374231612808138
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 890
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3717153072357178
          entropy_coeff: 0.0
          kl: 1.3407242477114778e-05
          model: {}
          policy_loss: -0.11568405479192734
          total_loss: 0.8376340270042419
          vf_explained_var: 0.005604161880910397
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,334,1316.5,2672,-0.337423,37.9678,-38.6719,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,334,1316.5,2672,-0.337423,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2680
  counters:
    num_agent_steps_sampled: 2680
    num_agent_steps_trained: 2680
    num_env_steps_sampled: 2680
    num_env_steps_trained: 2680
  custom_metrics: {}
  date: 2022-07-05_15-47-36
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: -0.35138525710847973
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 892
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.039973863210367
    episode_reward_mean: 0.039973863210367
    episode_reward_min: 0.039973863210367
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.039973863210367
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.14

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,336,1326.53,2688,-0.00152691,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2696
  counters:
    num_agent_steps_sampled: 2696
    num_agent_steps_trained: 2696
    num_env_steps_sampled: 2696
    num_env_steps_trained: 2696
  custom_metrics: {}
  date: 2022-07-05_15-47-44
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: 0.33388078378366887
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 898
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3770434856414795
          entropy_coeff: 0.0
          kl: 1.4498463315248955e-05
          model: {}
          policy_loss: 0.32064083218574524
          total_loss: 0.38327133655548096
          vf_explained_var: 0.032514140009880066
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,338,1333.72,2704,0.283397,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2712
  counters:
    num_agent_steps_sampled: 2712
    num_agent_steps_trained: 2712
    num_env_steps_sampled: 2712
    num_env_steps_trained: 2712
  custom_metrics: {}
  date: 2022-07-05_15-47-51
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: -0.10699576148304998
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 4
  episodes_total: 904
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3743927478790283
          entropy_coeff: 0.0
          kl: 4.596910730469972e-05
          model: {}
          policy_loss: -0.06072460487484932
          total_loss: 5.9442596435546875
          vf_explained_var: -0.00017376343021169305
         

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,339,1336.83,2712,-0.106996,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2720
  counters:
    num_agent_steps_sampled: 2720
    num_agent_steps_trained: 2720
    num_env_steps_sampled: 2720
    num_env_steps_trained: 2720
  custom_metrics: {}
  date: 2022-07-05_15-47-56
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 37.967849081178755
  episode_reward_mean: -0.4033419526156997
  episode_reward_min: -38.671865721941174
  episodes_this_iter: 2
  episodes_total: 906
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.05023742083916383
    episode_reward_mean: -0.05023742083916383
    episode_reward_min: -0.05023742083916383
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.05023742083916383
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processi

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,342,1349.66,2736,-0.399823,37.9678,-38.6719,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2752
  counters:
    num_agent_steps_sampled: 2752
    num_agent_steps_trained: 2752
    num_env_steps_sampled: 2752
    num_env_steps_trained: 2752
  custom_metrics: {}
  date: 2022-07-05_15-48-10
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.395293382185145
  episode_reward_mean: 0.44164261079552875
  episode_reward_min: -38.45515890975497
  episodes_this_iter: 2
  episodes_total: 916
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3675113916397095
          entropy_coeff: 0.0
          kl: 4.047679794894066e-06
          model: {}
          policy_loss: -0.36794957518577576
          total_loss: 7.6372151374816895
          vf_explained_var: -0.0005023002740927041
          vf

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,344,1356.27,2752,0.441643,38.3953,-38.4552,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,344,1356.27,2752,0.441643,38.3953,-38.4552,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2760
  counters:
    num_agent_steps_sampled: 2760
    num_agent_steps_trained: 2760
    num_env_steps_sampled: 2760
    num_env_steps_trained: 2760
  custom_metrics: {}
  date: 2022-07-05_15-48-16
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.395293382185145
  episode_reward_mean: -0.584801601742675
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 4
  episodes_total: 920
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.05592191435664129
    episode_reward_mean: 0.05592191435664129
    episode_reward_min: 0.05592191435664129
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.05592191435664129
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,346,1366.25,2768,-0.32031,38.3953,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2776
  counters:
    num_agent_steps_sampled: 2776
    num_agent_steps_trained: 2776
    num_env_steps_sampled: 2776
    num_env_steps_trained: 2776
  custom_metrics: {}
  date: 2022-07-05_15-48-23
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.52516388071549
  episode_reward_mean: 0.4037291904074011
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 924
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3795825242996216
          entropy_coeff: 0.0
          kl: 2.8748447675752686e-06
          model: {}
          policy_loss: -0.3268587291240692
          total_loss: 1.8238646984100342
          vf_explained_var: -0.0003317117807455361
          vf_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,348,1371.97,2784,-0.063614,38.5252,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2792
  counters:
    num_agent_steps_sampled: 2792
    num_agent_steps_trained: 2792
    num_env_steps_sampled: 2792
    num_env_steps_trained: 2792
  custom_metrics: {}
  date: 2022-07-05_15-48-29
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.52516388071549
  episode_reward_mean: 0.00714078348589041
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 930
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3552151918411255
          entropy_coeff: 0.0
          kl: 1.4009793630975764e-05
          model: {}
          policy_loss: 0.19002743065357208
          total_loss: 10.190028190612793
          vf_explained_var: -0.0037580609787255526
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,350,1379.03,2800,-0.0740395,38.5252,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2808
  counters:
    num_agent_steps_sampled: 2808
    num_agent_steps_trained: 2808
    num_env_steps_sampled: 2808
    num_env_steps_trained: 2808
  custom_metrics: {}
  date: 2022-07-05_15-48-37
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.52516388071549
  episode_reward_mean: 0.28752607013004094
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 4
  episodes_total: 936
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3781784772872925
          entropy_coeff: 0.0
          kl: 1.8595406800159253e-05
          model: {}
          policy_loss: -0.569390594959259
          total_loss: -0.524886965751648
          vf_explained_var: 0.0834096297621727
          vf_loss:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,352,1385.54,2816,0.255557,38.5252,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2824
  counters:
    num_agent_steps_sampled: 2824
    num_agent_steps_trained: 2824
    num_env_steps_sampled: 2824
    num_env_steps_trained: 2824
  custom_metrics: {}
  date: 2022-07-05_15-48-43
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.52516388071549
  episode_reward_mean: 0.27785988338209533
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 940
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.38079833984375
          entropy_coeff: 0.0
          kl: 1.7198036630361457e-06
          model: {}
          policy_loss: 0.3901858627796173
          total_loss: 2.4936447143554688
          vf_explained_var: -0.004393311217427254
          vf_loss

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,354,1392.08,2832,0.0418995,38.5252,-38.4939,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,354,1392.08,2832,0.0418995,38.5252,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2840
  counters:
    num_agent_steps_sampled: 2840
    num_agent_steps_trained: 2840
    num_env_steps_sampled: 2840
    num_env_steps_trained: 2840
  custom_metrics: {}
  date: 2022-07-05_15-48-52
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 38.52516388071549
  episode_reward_mean: 0.10997135672959288
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 946
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -0.8212619151261578
    episode_reward_mean: -0.8212619151261578
    episode_reward_min: -0.8212619151261578
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -0.8212619151261578
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,357,1404.76,2856,0.0587428,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2872
  counters:
    num_agent_steps_sampled: 2872
    num_agent_steps_trained: 2872
    num_env_steps_sampled: 2872
    num_env_steps_trained: 2872
  custom_metrics: {}
  date: 2022-07-05_15-49-06
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: 0.40184423041479733
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 956
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3749558925628662
          entropy_coeff: 0.0
          kl: 1.9919696569559164e-05
          model: {}
          policy_loss: -0.04807257279753685
          total_loss: 5.952366352081299
          vf_explained_var: -0.0012754718773066998
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,359,1411.45,2872,0.401844,39.585,-38.4939,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,359,1411.45,2872,0.401844,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2880
  counters:
    num_agent_steps_sampled: 2880
    num_agent_steps_trained: 2880
    num_env_steps_sampled: 2880
    num_env_steps_trained: 2880
  custom_metrics: {}
  date: 2022-07-05_15-49-11
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.32706529308027027
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 4
  episodes_total: 960
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 7.3915241023930776
    episode_reward_mean: 7.3915241023930776
    episode_reward_min: 7.3915241023930776
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 7.3915241023930776
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,362,1423.34,2896,-0.0140595,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2912
  counters:
    num_agent_steps_sampled: 2912
    num_agent_steps_trained: 2912
    num_env_steps_sampled: 2912
    num_env_steps_trained: 2912
  custom_metrics: {}
  date: 2022-07-05_15-49-26
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.0006720760715381813
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 970
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3708910942077637
          entropy_coeff: 0.0
          kl: 6.043789198884042e-06
          model: {}
          policy_loss: -0.2093314826488495
          total_loss: 0.41649696230888367
          vf_explained_var: -0.0213793832808733
          vf_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,364,1430.99,2912,-0.000672076,39.585,-38.4939,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,364,1430.99,2912,-0.000672076,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2920
  counters:
    num_agent_steps_sampled: 2920
    num_agent_steps_trained: 2920
    num_env_steps_sampled: 2920
    num_env_steps_trained: 2920
  custom_metrics: {}
  date: 2022-07-05_15-49-32
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.0148147377579992
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 972
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -10.885323613313066
    episode_reward_mean: -10.885323613313066
    episode_reward_min: -10.885323613313066
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -10.885323613313066
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,367,1444.03,2936,-0.040871,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2952
  counters:
    num_agent_steps_sampled: 2952
    num_agent_steps_trained: 2952
    num_env_steps_sampled: 2952
    num_env_steps_trained: 2952
  custom_metrics: {}
  date: 2022-07-05_15-49-46
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.014580134197786945
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 4
  episodes_total: 984
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.3698456287384033
          entropy_coeff: 0.0
          kl: 3.0236476504796883e-06
          model: {}
          policy_loss: -0.36420345306396484
          total_loss: 0.22489997744560242
          vf_explained_var: 0.007012289948761463
          v

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,369,1451,2952,-0.0145801,39.585,-38.4939,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,369,1451,2952,-0.0145801,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2960
  counters:
    num_agent_steps_sampled: 2960
    num_agent_steps_trained: 2960
    num_env_steps_sampled: 2960
    num_env_steps_trained: 2960
  custom_metrics: {}
  date: 2022-07-05_15-49-52
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.027727020439700496
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 986
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: -20.699643907845267
    episode_reward_mean: -20.699643907845267
    episode_reward_min: -20.699643907845267
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - -20.699643907845267
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,372,1463.33,2976,0.00107276,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 2992
  counters:
    num_agent_steps_sampled: 2992
    num_agent_steps_trained: 2992
    num_env_steps_sampled: 2992
    num_env_steps_trained: 2992
  custom_metrics: {}
  date: 2022-07-05_15-50-05
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: -0.3294347366740775
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 2
  episodes_total: 996
  experiment_id: 1ca6e38840c340479addb8dd7cbbc3be
  hostname: codah
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 9.999999747378752e-05
          entropy: 1.379133939743042
          entropy_coeff: 0.0
          kl: 1.2190264897071756e-05
          model: {}
          policy_loss: -0.2349664568901062
          total_loss: 0.11464598774909973
          vf_explained_var: 0.04200105741620064
          vf_los

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,374,1469.98,2992,-0.329435,39.585,-38.4939,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,RUNNING,100.37.253.28:560340,374,1469.98,2992,-0.329435,39.585,-38.4939,3


Result for PPOTrainer_compiler_gym_2c379_00000:
  agent_timesteps_total: 3000
  counters:
    num_agent_steps_sampled: 3000
    num_agent_steps_trained: 3000
    num_env_steps_sampled: 3000
    num_env_steps_trained: 3000
  custom_metrics: {}
  date: 2022-07-05_15-50-12
  done: true
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 39.58496904304526
  episode_reward_mean: 0.0710466199061537
  episode_reward_min: -38.49390607994264
  episodes_this_iter: 4
  episodes_total: 1000
  evaluation:
    custom_metrics: {}
    episode_len_mean: 3.0
    episode_media: {}
    episode_reward_max: 0.02068836848704647
    episode_reward_mean: 0.02068836848704647
    episode_reward_min: 0.02068836848704647
    episodes_this_iter: 1
    hist_stats:
      episode_lengths:
      - 3
      episode_reward:
      - 0.02068836848704647
    off_policy_estimator: {}
    policy_reward_max: {}
    policy_reward_mean: {}
    policy_reward_min: {}
    sampler_perf:
      mean_action_processing_ms: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_compiler_gym_2c379_00000,TERMINATED,100.37.253.28:560340,375,1476.08,3000,0.0710466,39.585,-38.4939,3


2022-07-05 15:50:12,878	INFO tune.py:747 -- Total run time: 1508.04 seconds (1507.31 seconds for the tuning loop).


In [11]:
# Select, for the first trial of the analysis, the checkpoint with the highest
# mean episode reward. (A stray trailing `s` after `analysis.trials[0]`
# previously made this cell a SyntaxError.)
checkpoint = analysis.get_best_checkpoint(
    trial=analysis.trials[0],
    metric="episode_reward_mean",
    mode="max",
)

In [12]:
# Display the Tune results table (metrics plus flattened `config/...` columns, one row per trial).
analysis.dataframe()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,num_healthy_workers,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_trained,...,config/log_level,config/lr,config/model,config/num_workers,config/rollout_fragment_length,config/seed,config/sgd_minibatch_size,config/soft_horizon,config/train_batch_size,logdir
0,39.584969,-38.493906,0.071047,3.0,4,2,3000,3000,3000,3000,...,ERROR,0.0001,"{'fcnet_hiddens': [5, 5]}",2,5,204,5,True,5,/home/dejang/ray_results/PPOTrainer_2022-07-05...


In [13]:
# Pick the best trial, and its log directory, using the same selection rule:
# maximise the mean episode reward.
selection_kwargs = {"metric": "episode_reward_mean", "mode": "max"}
trial = analysis.get_best_trial(**selection_kwargs)
log_dir = analysis.get_best_logdir(**selection_kwargs)
print(log_dir)

/home/dejang/ray_results/PPOTrainer_2022-07-05_15-25-04/PPOTrainer_compiler_gym_2c379_00000_0_2022-07-05_15-25-05


In [14]:
# Inspect the stop condition attached to the selected trial.
trial.stopping_criterion

{'episodes_total': 1000}

In [15]:
# Inspect the per-metric summary statistics (max / min / avg / last / last-N averages) kept on the trial.
trial.metric_analysis

{'episode_reward_max': {'max': 39.58496904304526,
  'min': 0.016922388863661375,
  'avg': 37.35342323484236,
  'last': 39.58496904304526,
  'last-5-avg': 39.58496904304526,
  'last-10-avg': 39.584969043045255},
 'episode_reward_min': {'max': 0.0034574530022073446,
  'min': -39.82938783525944,
  'avg': -37.717964688154716,
  'last': -38.49390607994264,
  'last-5-avg': -38.49390607994264,
  'last-10-avg': -38.49390607994264},
 'episode_reward_mean': {'max': 7.186053079470235,
  'min': -0.6798548492744083,
  'avg': 0.08435085538460046,
  'last': 0.0710466199061537,
  'last-5-avg': 0.014979303357484,
  'last-10-avg': -0.07999296632692823},
 'episode_len_mean': {'max': 3.0,
  'min': 3.0,
  'avg': 3.0000000000000004,
  'last': 3.0,
  'last-5-avg': 3.0,
  'last-10-avg': 3.0},
 'episodes_this_iter': {'max': 4,
  'min': 2,
  'avg': 2.6666666666666665,
  'last': 4,
  'last-5-avg': 2.8,
  'last-10-avg': 2.8},
 'num_healthy_workers': {'max': 2,
  'min': 2,
  'avg': 1.9999999999999962,
  'last': 2,

In [16]:
# Show the filesystem path of the checkpoint recorded on the trial.
trial.checkpoint.value

'/home/dejang/ray_results/PPOTrainer_2022-07-05_15-25-04/PPOTrainer_compiler_gym_2c379_00000_0_2022-07-05_15-25-05/checkpoint_000375/checkpoint-375'

In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import ray.rllib.agents.ppo as ppo
# Display RLlib's default PPO configuration, for comparison with the values the trial overrides.
ppo.DEFAULT_CONFIG



{'num_workers': 2,
 'num_envs_per_worker': 1,
 'create_env_on_driver': False,
 'rollout_fragment_length': 200,
 'batch_mode': 'truncate_episodes',
 'gamma': 0.99,
 'lr': 5e-05,
 'train_batch_size': 4000,
 'model': {'_use_default_native_models': False,
  '_disable_preprocessor_api': False,
  '_disable_action_flattening': False,
  'fcnet_hiddens': [256, 256],
  'fcnet_activation': 'tanh',
  'conv_filters': None,
  'conv_activation': 'relu',
  'post_fcnet_hiddens': [],
  'post_fcnet_activation': 'relu',
  'free_log_std': False,
  'no_final_linear': False,
  'vf_share_layers': False,
  'use_lstm': False,
  'max_seq_len': 20,
  'lstm_cell_size': 256,
  'lstm_use_prev_action': False,
  'lstm_use_prev_reward': False,
  '_time_major': False,
  'use_attention': False,
  'attention_num_transformer_units': 1,
  'attention_dim': 64,
  'attention_num_heads': 1,
  'attention_head_dim': 32,
  'attention_memory_inference': 50,
  'attention_memory_training': 50,
  'attention_position_wise_mlp_dim': 32,

In [18]:
# Show the checkpoint path recorded on the trial.
# NOTE(review): this repeats an expression already displayed in an earlier cell; one of the two can likely be dropped.
trial.checkpoint.value

'/home/dejang/ray_results/PPOTrainer_2022-07-05_15-25-04/PPOTrainer_compiler_gym_2c379_00000_0_2022-07-05_15-25-05/checkpoint_000375/checkpoint-375'

In [19]:
# Show the configuration dictionary the selected trial was run with.
trial.config

{'log_level': 'ERROR',
 'seed': 204,
 'num_workers': 2,
 'env': 'compiler_gym',
 'rollout_fragment_length': 5,
 'train_batch_size': 5,
 'sgd_minibatch_size': 5,
 'gamma': 0.8,
 'lr': 0.0001,
 'horizon': 3,
 'soft_horizon': True,
 'evaluation_interval': 5,
 'evaluation_num_episodes': 1,
 'model': {'fcnet_hiddens': [5, 5]}}

In [20]:
# Rebuild a PPO trainer from the trial's own configuration, then load the
# weights stored in the trial's checkpoint so the agent can be evaluated.
trainer = PPOTrainer(env="compiler_gym", config=trial.config)
trial_checkpoint_path = trial.checkpoint.value
trainer.restore(trial_checkpoint_path)

2022-07-05 15:50:14,025	INFO trainer.py:2332 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-07-05 15:50:14,030	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-07-05 15:50:14,031	INFO trainer.py:903 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(RolloutWorker pid=654916)[0m E0705 15:50:19.217818 140569179321920 example_service.py:263] CRITICAL - 
[2m[36m(RolloutWorker pid=654916)[0m 
[2m[36m(RolloutWorker pid=654916)[0m Working_dir = /dev/shm/compiler_gym_dejang/s/0705T155018-182728-0236
[2m[36m(RolloutWorker pid=654916)[0m 
[2m[36m(RolloutWorker pid=654915)[

In [21]:
# Fetch the trainer's policy once and take the model from that same object,
# so `policy` and `model` are guaranteed to refer to one another.
policy = trainer.get_policy()
model = policy.model

In [22]:
# Show the model configuration carried by the policy's model.
model.model_config

{'_use_default_native_models': False,
 '_disable_preprocessor_api': False,
 '_disable_action_flattening': False,
 'fcnet_hiddens': [5, 5],
 'fcnet_activation': 'tanh',
 'conv_filters': None,
 'conv_activation': 'relu',
 'post_fcnet_hiddens': [],
 'post_fcnet_activation': 'relu',
 'free_log_std': False,
 'no_final_linear': False,
 'vf_share_layers': False,
 'use_lstm': False,
 'max_seq_len': 20,
 'lstm_cell_size': 256,
 'lstm_use_prev_action': False,
 'lstm_use_prev_reward': False,
 '_time_major': False,
 'use_attention': False,
 'attention_num_transformer_units': 1,
 'attention_dim': 64,
 'attention_num_heads': 1,
 'attention_head_dim': 32,
 'attention_memory_inference': 50,
 'attention_memory_training': 50,
 'attention_position_wise_mlp_dim': 32,
 'attention_init_gru_gate_bias': 2.0,
 'attention_use_n_prev_actions': 0,
 'attention_use_n_prev_rewards': 0,
 'framestack': True,
 'dim': 84,
 'grayscale': False,
 'zero_mean': True,
 'custom_model': None,
 'custom_model_config': {},
 'custo

In [23]:
def run_rollout(agent, env, n_iter=1, max_steps=5, verbose=False):
    """
    Roll out `agent` in `env` for `n_iter` episodes, yielding each episode's total reward.

    Actions are chosen with `explore=False`. An episode stops as soon as the
    environment reports `done`, after `max_steps` steps, or when computing or
    applying an action raises an exception, whichever happens first.

    Args:
        agent: Object exposing `compute_single_action(state, explore=False)`.
        env: Environment exposing `reset()`, `step(action)` returning
            `(state, reward, done, info)`, `action_space.to_string(action)`
            and, when `verbose` is set, `render()`.
        n_iter: Number of episodes to run.
        max_steps: Upper bound on the number of steps taken per episode.
        verbose: If true, also print the running reward sum and render the
            environment after every step.

    Yields:
        The sum of the rewards collected during each episode.
    """
    for _ in range(n_iter):
        state = env.reset()
        sum_reward = 0

        for _ in range(max_steps):
            try:
                action = int(agent.compute_single_action(state, explore=False))
                print(f"Compute action = {env.action_space.to_string(action)}")

                state, reward, done, _info = env.step(action)
                sum_reward += reward
                print(f"Compute reward = {reward}")

                if verbose:
                    print("reward {:6.3f}  sum {:6.3f}".format(reward, sum_reward))
                    env.render()
            except Exception as e:
                # Best-effort rollout: report the failure and end this episode
                # with whatever reward has been accumulated so far.
                print(f'----------------------------> Exception = {e}')
                break

            # Stop at the end of the episode; stepping a finished environment
            # would add rewards that do not belong to this episode.
            if done:
                break

        # report at the end of each episode
        print("CUMULATIVE REWARD:", round(sum_reward, 3), "\n")
        yield sum_reward


In [24]:
# Evaluate the trainer: gather the total reward of every rollout episode and
# report their mean.
# NOTE(review): `env` is not created in this cell; presumably it is left over
# from an earlier cell. Confirm it is still the intended, usable environment.
history = list(run_rollout(trainer, env, verbose=False))

print("average reward:", round(sum(history) / len(history), 3))

E0705 15:50:25.025571 140184553244224 example_service.py:263] CRITICAL - 

Working_dir = /dev/shm/compiler_gym_dejang/s/0705T152501-249505-dbfd



Compute action = swap_down
Compute reward = -0.10795806245708439
Compute action = swap_down
Compute reward = -1.2608128186439285
Compute action = swap_down
Compute reward = -0.06702328541776437
Compute action = swap_down
Compute reward = -0.0016316470605532984
Compute action = swap_down
Compute reward = 0.006835741497085546
CUMULATIVE REWARD: -1.431 

average reward: -1.431


In [25]:
# If running in a notebook, finish the wandb run to upload the tensorboard logs to W&B
wandb.finish()
# Shut down the Ray runtime used by this notebook session.
ray.shutdown()

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/agent_timesteps_total,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/counters/num_agent_steps_sampled,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/counters/num_agent_steps_trained,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/counters/num_env_steps_sampled,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/counters/num_env_steps_trained,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
ray/tune/done,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ray/tune/episode_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ray/tune/episode_reward_max,▁▄▄▄▅▅███▇████▇▇▇▇▄▄▄▅▅▅████▇▇▇▇▇▇▅▇▇▇██
ray/tune/episode_reward_mean,█▅▂▄▁▃▂▃▁▁▂▁▄▃▂▁▃▁▁▄▁▄▂▂▂▂▁▂▁▃▃▂▂▄▂▁▁▃▁▂

0,1
global_step,3000.0
ray/tune/agent_timesteps_total,3000.0
ray/tune/counters/num_agent_steps_sampled,3000.0
ray/tune/counters/num_agent_steps_trained,3000.0
ray/tune/counters/num_env_steps_sampled,3000.0
ray/tune/counters/num_env_steps_trained,3000.0
ray/tune/done,1.0
ray/tune/episode_len_mean,3.0
ray/tune/episode_reward_max,39.58497
ray/tune/episode_reward_mean,0.07105
