In [1]:
from gym.spaces import Box
import numpy as np

from gym.envs.classic_control import PendulumEnv

In [2]:
class StatelessPendulum(PendulumEnv):
    """Partially observable variant of the Pendulum gym environment.

    https://github.com/openai/gym/blob/master/gym/envs/classic_control/
    pendulum.py

    We delete the angular velocity component of the state, so that it
    can only be solved by a memory enhanced model (policy).
    """

    def __init__(self, config=None):
        config = config or {}
        g = config.get("g", 10.0)

        super().__init__(g=g)

        # Fix our observation-space (remove angular velocity component).
        high = np.array([1., 1.], dtype=np.float32)
        self.observation_space = Box(low=-high, high=high, dtype=np.float32)

    def step(self, action):
        next_obs, reward, done, info = super().step(action)
        # next_obs is [cos(theta), sin(theta), theta-dot (angular velocity)]
        return next_obs[:-1], reward, done, info

    def reset(self):
        init_obs = super().reset()
        # init_obs is [cos(theta), sin(theta), theta-dot (angular velocity)]
        return init_obs[:-1]


In [3]:
import numpy as np

import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog




In [4]:
ray.init(num_cpus=0 or None)



{'node_ip_address': '172.18.0.3',
 'raylet_ip_address': '172.18.0.3',
 'redis_address': '172.18.0.3:48762',
 'object_store_address': '/tmp/ray/session_2021-12-09_14-27-56_974321_985/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-09_14-27-56_974321_985/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-12-09_14-27-56_974321_985',
 'metrics_export_port': 56289,
 'node_id': 'd6b5b0d47bcfc03d7e51529ccf9ca44cd3765ceefb7350068da1e0c4'}

In [6]:
register_env("StatelessPendulum", lambda _: StatelessPendulum())


In [7]:
env = StatelessPendulum()

In [8]:
env.reset()

array([ 0.07319583, -0.9973176 ], dtype=float32)

In [19]:
next_obs, reward, done, info = env.step([0.1])

In [20]:
reward

-2.2835522176273275

In [13]:
configs = {
    "PPO": {
        "num_sgd_iter": 5,
        "sgd_minibatch_size": 128, 
        "simple_optimizer": True,
        "model": {
            "vf_share_layers": True,
        },
        "vf_loss_coeff": 0.0001,
    },
    "IMPALA": {
        "num_workers": 2,
        "num_gpus": 0,
        "vf_loss_coeff": 0.01,
    },
}

In [14]:
config = dict(
    configs['PPO'],
    **{
        "env": 'StatelessPendulum',
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 0,
        "model": {
            "use_lstm": True,
            "lstm_cell_size": 256,
            "lstm_use_prev_action": True,
            "lstm_use_prev_reward": True,
        },
        "framework": 'torch',
    })

In [15]:
stop = {
    "training_iteration": 40,
    "timesteps_total": 100000,
    "episode_reward_mean": 90.,
}

In [16]:
results = tune.run('PPO', config=config, stop=stop, verbose=2)



[2m[36m(PPO pid=1107)[0m 2021-12-09 14:35:01,632	INFO trainer.py:719 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.




Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=4000,timers={'sample_time_ms': 3829.51, 'sample_throughput': 1044.52, 'learn_time_ms': 4320.37, 'learn_throughput': 925.847, 'update_time_ms': 2.478},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 11.967570070786909, 'policy_loss': 0.025624675382718894, 'vf_loss': 119400.87341382576, 'vf_explained_var': 0.0015878088546521737, 'kl': 0.009291289999347379, 'entropy': 1.2064687035300514, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 4000, 'num_agent_steps_sampled': 4000, 'num_steps_trained': 4000

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=8000,timers={'sample_time_ms': 6100.037, 'sample_throughput': 655.734, 'learn_time_ms': 4252.824, 'learn_throughput': 940.551, 'update_time_ms': 2.363},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 12.090836409366492, 'policy_loss': 0.035774418372999536, 'vf_loss': 120528.66586174243, 'vf_explained_var': 0.004245922059723825, 'kl': 0.010978210507715726, 'entropy': 1.2338258078604034, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 8000, 'num_agent_steps_sampled': 8000, 'num_steps_trained': 800

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=12000,timers={'sample_time_ms': 6777.558, 'sample_throughput': 590.183, 'learn_time_ms': 4239.878, 'learn_throughput': 943.423, 'update_time_ms': 2.369},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 11.179909732125022, 'policy_loss': 0.031021384276788345, 'vf_loss': 111465.52897727273, 'vf_explained_var': -0.003942189072117661, 'kl': 0.01167905356410456, 'entropy': 1.1489107066934758, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 12000, 'num_agent_steps_sampled': 12000, 'num_steps_trained': 

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=16000,timers={'sample_time_ms': 7085.333, 'sample_throughput': 564.547, 'learn_time_ms': 4193.89, 'learn_throughput': 953.768, 'update_time_ms': 2.339},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 12.968038220839066, 'policy_loss': 0.0284602704373273, 'vf_loss': 129383.3165719697, 'vf_explained_var': -0.0007770122903766055, 'kl': 0.006232759871901448, 'entropy': 1.0871253627719302, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 16000, 'num_agent_steps_sampled': 16000, 'num_steps_trained': 16

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=20000,timers={'sample_time_ms': 7211.503, 'sample_throughput': 554.669, 'learn_time_ms': 4159.755, 'learn_throughput': 961.595, 'update_time_ms': 2.311},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 12.733311320796158, 'policy_loss': 0.02919377993905183, 'vf_loss': 127030.81441761364, 'vf_explained_var': -0.00017228668386285955, 'kl': 0.005182849029855706, 'entropy': 1.101795109835538, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 20000, 'num_agent_steps_sampled': 20000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=24000,timers={'sample_time_ms': 7309.432, 'sample_throughput': 547.238, 'learn_time_ms': 4161.942, 'learn_throughput': 961.09, 'update_time_ms': 2.28},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 11.760948418125961, 'policy_loss': 0.022649290254621795, 'vf_loss': 117375.89292140151, 'vf_explained_var': -8.421305454138553e-05, 'kl': 0.0035502281227922225, 'entropy': 1.1013051502632372, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 24000, 'num_agent_steps_sampled': 24000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=28000,timers={'sample_time_ms': 7414.797, 'sample_throughput': 539.462, 'learn_time_ms': 4159.726, 'learn_throughput': 961.602, 'update_time_ms': 2.266},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.09999999999999998, 'cur_lr': 5e-05, 'total_loss': 11.972091949347293, 'policy_loss': 0.02172802330530954, 'vf_loss': 119496.81455965909, 'vf_explained_var': -3.620566743792909e-05, 'kl': 0.006827161160508281, 'entropy': 1.1510068445494681, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 28000, 'num_agent_steps_sampled': 28000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=32000,timers={'sample_time_ms': 7468.689, 'sample_throughput': 535.569, 'learn_time_ms': 4166.908, 'learn_throughput': 959.944, 'update_time_ms': 2.277},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.09999999999999998, 'cur_lr': 5e-05, 'total_loss': 10.069404498013583, 'policy_loss': 0.03344017544930632, 'vf_loss': 100356.19914772727, 'vf_explained_var': 4.482630527380741e-06, 'kl': 0.0034472581487292966, 'entropy': 1.1721658085331772, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 32000, 'num_agent_steps_sampled': 32000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=36000,timers={'sample_time_ms': 7534.374, 'sample_throughput': 530.9, 'learn_time_ms': 4162.39, 'learn_throughput': 960.986, 'update_time_ms': 2.255},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.04999999999999999, 'cur_lr': 5e-05, 'total_loss': 10.100872429934414, 'policy_loss': 0.02703474255448038, 'vf_loss': 100736.57964015151, 'vf_explained_var': 5.287835092255564e-06, 'kl': 0.003602371192083749, 'entropy': 1.182693695299553, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 36000, 'num_agent_steps_sampled': 36000, 'num_steps_trained': 3600

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=40000,timers={'sample_time_ms': 7586.82, 'sample_throughput': 527.23, 'learn_time_ms': 4168.265, 'learn_throughput': 959.632, 'update_time_ms': 2.256},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.024999999999999994, 'cur_lr': 5e-05, 'total_loss': 10.013240753520618, 'policy_loss': 0.027151027062174045, 'vf_loss': 99859.10490056819, 'vf_explained_var': 4.741856546112985e-05, 'kl': 0.007181683168628307, 'entropy': 1.1170763102444736, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 40000, 'num_agent_steps_sampled': 40000, 'num_steps_trained': 4

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=44000,timers={'sample_time_ms': 8002.316, 'sample_throughput': 499.855, 'learn_time_ms': 4156.095, 'learn_throughput': 962.442, 'update_time_ms': 2.217},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.024999999999999994, 'cur_lr': 5e-05, 'total_loss': 9.870027484315814, 'policy_loss': 0.027586115529817164, 'vf_loss': 98423.72961647727, 'vf_explained_var': -3.437417926210346e-05, 'kl': 0.0027472872610711766, 'entropy': 1.0748236334685124, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 44000, 'num_agent_steps_sampled': 44000, 'num_steps_trained'

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=48000,timers={'sample_time_ms': 7961.398, 'sample_throughput': 502.424, 'learn_time_ms': 4153.858, 'learn_throughput': 962.96, 'update_time_ms': 2.218},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.012499999999999997, 'cur_lr': 5e-05, 'total_loss': 10.009270526423599, 'policy_loss': 0.02945340009349765, 'vf_loss': 99797.56001420454, 'vf_explained_var': 4.647717331395004e-06, 'kl': 0.00491361911229676, 'entropy': 1.1054355469616977, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 48000, 'num_agent_steps_sampled': 48000, 'num_steps_trained': 48

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=52000,timers={'sample_time_ms': 7936.552, 'sample_throughput': 503.997, 'learn_time_ms': 4141.555, 'learn_throughput': 965.821, 'update_time_ms': 2.259},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 9.016809475060665, 'policy_loss': 0.02496666511040971, 'vf_loss': 89918.11503314394, 'vf_explained_var': -4.016738949400006e-05, 'kl': 0.005049980648156382, 'entropy': 1.0688065474683588, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 52000, 'num_agent_steps_sampled': 52000, 'num_steps_trained': 

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=56000,timers={'sample_time_ms': 7940.821, 'sample_throughput': 503.726, 'learn_time_ms': 4141.67, 'learn_throughput': 965.794, 'update_time_ms': 2.265},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 9.561970836466008, 'policy_loss': 0.02693808215077628, 'vf_loss': 95350.01453598485, 'vf_explained_var': 1.72745097767223e-06, 'kl': 0.005049899711859057, 'entropy': 1.0873664603088842, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 56000, 'num_agent_steps_sampled': 56000, 'num_steps_trained': 560

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=60000,timers={'sample_time_ms': 7964.698, 'sample_throughput': 502.216, 'learn_time_ms': 4159.794, 'learn_throughput': 961.586, 'update_time_ms': 2.263},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 9.117169422091859, 'policy_loss': 0.025854801234196534, 'vf_loss': 90912.50828598485, 'vf_explained_var': 1.9561883175011837e-05, 'kl': 0.010253446353565013, 'entropy': 1.0902643644448482, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 60000, 'num_agent_steps_sampled': 60000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=64000,timers={'sample_time_ms': 7981.318, 'sample_throughput': 501.17, 'learn_time_ms': 4151.477, 'learn_throughput': 963.513, 'update_time_ms': 2.278},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 7.656306658369122, 'policy_loss': 0.0289711313717293, 'vf_loss': 76272.9757220644, 'vf_explained_var': -2.7779376868045693e-06, 'kl': 0.00610656323200695, 'entropy': 1.0810565962935939, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 64000, 'num_agent_steps_sampled': 64000, 'num_steps_trained': 640

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=68000,timers={'sample_time_ms': 7985.096, 'sample_throughput': 500.933, 'learn_time_ms': 4145.993, 'learn_throughput': 964.787, 'update_time_ms': 2.262},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 6.7856757915381225, 'policy_loss': 0.016987755798706504, 'vf_loss': 67686.56678503788, 'vf_explained_var': 4.8502286275227865e-05, 'kl': 0.005044563717363174, 'entropy': 1.0862347588394627, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 68000, 'num_agent_steps_sampled': 68000, 'num_steps_trained'

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=72000,timers={'sample_time_ms': 8027.26, 'sample_throughput': 498.302, 'learn_time_ms': 4140.513, 'learn_throughput': 966.064, 'update_time_ms': 2.237},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 7.055728327144276, 'policy_loss': 0.025546492331407288, 'vf_loss': 70301.44382102272, 'vf_explained_var': 7.155122178973574e-05, 'kl': 0.0060151863851700006, 'entropy': 1.0381105910647999, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 72000, 'num_agent_steps_sampled': 72000, 'num_steps_trained': 

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=76000,timers={'sample_time_ms': 8030.146, 'sample_throughput': 498.123, 'learn_time_ms': 4155.001, 'learn_throughput': 962.695, 'update_time_ms': 2.247},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 5.918443610451439, 'policy_loss': 0.02444724233990366, 'vf_loss': 58939.523508522725, 'vf_explained_var': 1.241582812684955e-05, 'kl': 0.007058836750300837, 'entropy': 1.0107924663659298, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 76000, 'num_agent_steps_sampled': 76000, 'num_steps_trained': 

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=80000,timers={'sample_time_ms': 8016.339, 'sample_throughput': 498.981, 'learn_time_ms': 4144.318, 'learn_throughput': 965.177, 'update_time_ms': 2.248},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 6.098720208081332, 'policy_loss': 0.024597947048305563, 'vf_loss': 60740.894294507576, 'vf_explained_var': -5.319551988081499e-05, 'kl': 0.005274355254893303, 'entropy': 0.9892216046651204, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 80000, 'num_agent_steps_sampled': 80000, 'num_steps_trained'

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=84000,timers={'sample_time_ms': 8016.955, 'sample_throughput': 498.943, 'learn_time_ms': 4139.511, 'learn_throughput': 966.298, 'update_time_ms': 2.253},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 5.303510966445461, 'policy_loss': 0.030519427850165152, 'vf_loss': 52729.32091619318, 'vf_explained_var': -0.0002134846918510668, 'kl': 0.009533050291430409, 'entropy': 0.9723085746620641, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 84000, 'num_agent_steps_sampled': 84000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=88000,timers={'sample_time_ms': 8009.678, 'sample_throughput': 499.396, 'learn_time_ms': 4165.981, 'learn_throughput': 960.158, 'update_time_ms': 2.254},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.006249999999999999, 'cur_lr': 5e-05, 'total_loss': 5.315248922145728, 'policy_loss': 0.028462743437425657, 'vf_loss': 52867.7306936553, 'vf_explained_var': -3.028710683186849e-05, 'kl': 0.0021120667625597537, 'entropy': 0.9669447862740719, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 88000, 'num_agent_steps_sampled': 88000, 'num_steps_trained':

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=92000,timers={'sample_time_ms': 8068.821, 'sample_throughput': 495.735, 'learn_time_ms': 4166.963, 'learn_throughput': 959.932, 'update_time_ms': 2.191},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.0031249999999999993, 'cur_lr': 5e-05, 'total_loss': 5.457133235353412, 'policy_loss': 0.024555991623889318, 'vf_loss': 54325.66075994318, 'vf_explained_var': -7.848667375969165e-06, 'kl': 0.003624000229419886, 'entropy': 0.9445365515622226, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 92000, 'num_agent_steps_sampled': 92000, 'num_steps_trained'

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=96000,timers={'sample_time_ms': 8054.071, 'sample_throughput': 496.643, 'learn_time_ms': 4168.537, 'learn_throughput': 959.569, 'update_time_ms': 2.172},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.0015624999999999997, 'cur_lr': 5e-05, 'total_loss': 5.4939002932924215, 'policy_loss': 0.024137740373385674, 'vf_loss': 54697.492199337124, 'vf_explained_var': 2.9766920841101442e-05, 'kl': 0.008610114820036022, 'entropy': 0.9873132958556666, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 96000, 'num_agent_steps_sampled': 96000, 'num_steps_traine

Trial PPO_StatelessPendulum_ce305_00000 reported episode_reward_max=nan,episode_reward_min=nan,episode_reward_mean=nan,episode_len_mean=nan,episode_media={},episodes_this_iter=0,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=100000,timers={'sample_time_ms': 8054.903, 'sample_throughput': 496.592, 'learn_time_ms': 4172.287, 'learn_throughput': 958.707, 'update_time_ms': 2.167},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.0015624999999999997, 'cur_lr': 5e-05, 'total_loss': 5.3703459450692845, 'policy_loss': 0.01972379618409005, 'vf_loss': 53506.06985085227, 'vf_explained_var': -3.84583617701675e-05, 'kl': 0.00980129351949309, 'entropy': 0.8882211869413202, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}}}, 'num_steps_sampled': 100000, 'num_agent_steps_sampled': 100000, 'num_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_StatelessPendulum_ce305_00000,TERMINATED,172.18.0.3:1107,25,200.292,100000,,,,


2021-12-09 14:38:24,287	INFO tune.py:626 -- Total run time: 204.67 seconds (204.50 seconds for the tuning loop).
