In [1]:
from gym.spaces import Box
import numpy as np

from gym.envs.classic_control import CartPoleEnv

In [2]:
class StatelessCartPole(CartPoleEnv):
    """Partially observable variant of the CartPole gym environment.

    https://github.com/openai/gym/blob/master/gym/envs/classic_control/
    cartpole.py

    We delete the x- and angular velocity components of the state, so that it
    can only be solved by a memory enhanced model (policy).
    """

    def __init__(self, config=None):
        super().__init__()

        # Fix our observation-space (remove 2 velocity components).
        high = np.array(
            [
                self.x_threshold * 2,
                self.theta_threshold_radians * 2,
            ],
            dtype=np.float32)

        self.observation_space = Box(low=-high, high=high, dtype=np.float32)

    def step(self, action):
        next_obs, reward, done, info = super().step(action)
        # next_obs is [x-pos, x-veloc, angle, angle-veloc]
        return np.array([next_obs[0], next_obs[2]]), reward, done, info

    def reset(self):
        init_obs = super().reset()
        # init_obs is [x-pos, x-veloc, angle, angle-veloc]
        return np.array([init_obs[0], init_obs[2]])


In [3]:
import numpy as np

import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog




In [4]:
ray.init(num_cpus=0 or None)



{'node_ip_address': '172.18.0.3',
 'raylet_ip_address': '172.18.0.3',
 'redis_address': '172.18.0.3:20885',
 'object_store_address': '/tmp/ray/session_2021-12-09_14-46-08_525733_1744/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-09_14-46-08_525733_1744/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-12-09_14-46-08_525733_1744',
 'metrics_export_port': 56606,
 'node_id': 'ef0a3efd1e0896fd3cc66683fe40c03504d0390811079bdf301d0c9a'}

In [5]:
register_env("StatelessCartPole", lambda _: StatelessCartPole())


In [6]:
env = StatelessCartPole()

In [7]:
env.reset()

array([-0.034167  ,  0.01839731], dtype=float32)

In [10]:
next_obs, reward, done, info = env.step(1)

In [11]:
reward

1.0

In [12]:
configs = {
    "PPO": {
        "num_sgd_iter": 5,
        "sgd_minibatch_size": 128, 
        "simple_optimizer": True,
        "model": {
            "vf_share_layers": True,
        },
        "vf_loss_coeff": 0.0001,
    },
    "IMPALA": {
        "num_workers": 2,
        "num_gpus": 0,
        "vf_loss_coeff": 0.01,
    },
}

In [13]:
config = dict(
    configs['PPO'],
    **{
        "env": 'StatelessCartPole',
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 0,
        "model": {
            "use_lstm": True,
            "lstm_cell_size": 256,
            "lstm_use_prev_action": True,
            "lstm_use_prev_reward": True,
        },
        "framework": 'torch',
    })

In [14]:
stop = {
    "training_iteration": 40,
    "timesteps_total": 100000,
    "episode_reward_mean": 90.,
}

In [15]:
results = tune.run('PPO', config=config, stop=stop, verbose=2)

2021-12-09 14:47:06,875	INFO logger.py:605 -- pip install "ray[tune]" to see TensorBoard files.
2021-12-09 14:47:07,171	ERROR syncer.py:111 -- Log sync requires rsync to be installed.
[2m[36m(PPO pid=1875)[0m 2021-12-09 14:47:08,985	INFO trainer.py:719 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.






Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=94.0,episode_reward_min=9.0,episode_reward_mean=22.3876404494382,episode_len_mean=22.3876404494382,episode_media={},episodes_this_iter=178,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.14308004126675067, 'mean_inference_ms': 1.6070663899198163, 'mean_action_processing_ms': 0.06331419241779868, 'mean_env_wait_ms': 0.08095579704959532, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=4000,timers={'sample_time_ms': 4220.256, 'sample_throughput': 947.81, 'learn_time_ms': 4569.073, 'learn_throughput': 875.451, 'update_time_ms': 2.438},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': -0.019445178443283747, 'policy_loss': -0.047823909379428986, 'vf_loss': 260.6659672592626, 'vf_ex

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=128.0,episode_reward_min=9.0,episode_reward_mean=31.753968253968253,episode_len_mean=31.753968253968253,episode_media={},episodes_this_iter=126,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.137668463281855, 'mean_inference_ms': 1.5973858482528005, 'mean_action_processing_ms': 0.06260232404089879, 'mean_env_wait_ms': 0.07955779022438238, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=8000,timers={'sample_time_ms': 6414.553, 'sample_throughput': 623.582, 'learn_time_ms': 4485.676, 'learn_throughput': 891.727, 'update_time_ms': 2.579},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.009039733389561827, 'policy_loss': -0.03391761516757084, 'vf_loss': 412.0120442708333, 'vf_

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=145.0,episode_reward_min=10.0,episode_reward_mean=40.31,episode_len_mean=40.31,episode_media={},episodes_this_iter=97,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1348987013830322, 'mean_inference_ms': 1.6001005780616608, 'mean_action_processing_ms': 0.06258057852834059, 'mean_env_wait_ms': 0.0795256211103808, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=12000,timers={'sample_time_ms': 7091.215, 'sample_throughput': 564.078, 'learn_time_ms': 4436.235, 'learn_throughput': 901.666, 'update_time_ms': 2.426},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.02469274295550404, 'policy_loss': -0.044319790601730344, 'vf_loss': 674.2920326926492, 'vf_explained_var': -0.004970

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=242.0,episode_reward_min=11.0,episode_reward_mean=57.99,episode_len_mean=57.99,episode_media={},episodes_this_iter=53,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13310762738466544, 'mean_inference_ms': 1.5942089366192982, 'mean_action_processing_ms': 0.0621789681963732, 'mean_env_wait_ms': 0.0789814252846734, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=16000,timers={'sample_time_ms': 7422.193, 'sample_throughput': 538.924, 'learn_time_ms': 4427.337, 'learn_throughput': 903.478, 'update_time_ms': 2.457},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.09632529988884926, 'policy_loss': -0.03521677722420656, 'vf_loss': 1293.2174002907493, 'vf_explained_var': -0.058676

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=242.0,episode_reward_min=12.0,episode_reward_mean=67.59,episode_len_mean=67.59,episode_media={},episodes_this_iter=63,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1302822078620692, 'mean_inference_ms': 1.5914300532001175, 'mean_action_processing_ms': 0.06210568116070455, 'mean_env_wait_ms': 0.07875218459444656, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=20000,timers={'sample_time_ms': 7611.563, 'sample_throughput': 525.516, 'learn_time_ms': 4393.91, 'learn_throughput': 910.351, 'update_time_ms': 2.398},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.037750435823743995, 'policy_loss': -0.04029302746621948, 'vf_loss': 758.5688905195757, 'vf_explained_var': -0.171624

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=278.0,episode_reward_min=13.0,episode_reward_mean=81.98,episode_len_mean=81.98,episode_media={},episodes_this_iter=30,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.12916824556692064, 'mean_inference_ms': 1.5902848562795675, 'mean_action_processing_ms': 0.062064887564958246, 'mean_env_wait_ms': 0.07871204247497782, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=24000,timers={'sample_time_ms': 7724.163, 'sample_throughput': 517.855, 'learn_time_ms': 4359.218, 'learn_throughput': 917.596, 'update_time_ms': 2.371},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.08713296892855203, 'policy_loss': -0.022104012762958352, 'vf_loss': 1066.41193819913, 'vf_explained_var': -0.0988

Trial PPO_StatelessCartPole_7faac_00000 reported episode_reward_max=278.0,episode_reward_min=13.0,episode_reward_mean=100.31,episode_len_mean=100.31,episode_media={},episodes_this_iter=34,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.12835189509370581, 'mean_inference_ms': 1.5902664262032473, 'mean_action_processing_ms': 0.06202862545197946, 'mean_env_wait_ms': 0.07865247430877828, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=28000,timers={'sample_time_ms': 7796.922, 'sample_throughput': 513.023, 'learn_time_ms': 4346.645, 'learn_throughput': 920.25, 'update_time_ms': 2.362},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.09229471022545388, 'policy_loss': -0.0089716349587296, 'vf_loss': 992.6105527010831, 'vf_explained_var': -0.15230

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_StatelessCartPole_7faac_00000,TERMINATED,172.18.0.3:1875,7,58.8009,28000,100.31,278,13,100.31


2021-12-09 14:48:09,975	INFO tune.py:626 -- Total run time: 63.10 seconds (62.90 seconds for the tuning loop).
