In [1]:
import gym
import math
import numpy as np
from gym import spaces
from gym.utils import seeding

In [2]:
class StatelessCartPoleMD(gym.Env):
    """Partially observable variant of the CartPole gym environment.

    https://github.com/openai/gym/blob/master/gym/envs/classic_control/
    cartpole.py

    We delete the velocity component of the state, so that it can only be
    solved by a LSTM policy.
    """

    metadata = {
        "render.modes": ["human", "rgb_array"],
        "video.frames_per_second": 60
    }

    def __init__(self, config=None):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        high = np.array([
            self.x_threshold * 2,
            self.theta_threshold_radians * 2,
        ])

        self.action_space = spaces.MultiDiscrete([2,2])
        self.observation_space = spaces.Box(-high, high)

        self.seed()
        self.viewer = None
        self.state = None

        self.steps_beyond_done = None

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        action = action[0] and action[1]
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta
                ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length *
            (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)
        )
        xacc = (temp -
                self.polemass_length * thetaacc * costheta / self.total_mass)
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)
        done = (x < -self.x_threshold or x > self.x_threshold
                or theta < -self.theta_threshold_radians
                or theta > self.theta_threshold_radians)
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            self.steps_beyond_done += 1
            reward = 0.0

        rv = np.r_[self.state[0], self.state[2]]
        return rv, reward, done, {}

    def reset(self):
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, ))
        self.steps_beyond_done = None

        rv = np.r_[self.state[0], self.state[2]]
        return rv

    def render(self, mode="human"):
        screen_width = 600
        screen_height = 400

        world_width = self.x_threshold * 2
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = (-cartwidth / 2, cartwidth / 2, cartheight / 2,
                          -cartheight / 2)
            axleoffset = cartheight / 4.0
            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l, r, t, b = (-polewidth / 2, polewidth / 2,
                          polelen - polewidth / 2, -polewidth / 2)
            pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            pole.set_color(.8, .6, .4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth / 2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5, .5, .8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.state is None:
            return None

        x = self.state
        cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array=mode == "rgb_array")

    def close(self):
        if self.viewer:
            self.viewer.close()


In [3]:
import argparse
import os
from ray.rllib.utils.test_utils import check_learning_achieved

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument(
    "--run",
    type=str,
    default="PPO",
    help="The RLlib-registered algorithm to use.")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="torch",
    help="The DL framework specifier.")
parser.add_argument("--eager-tracing", action="store_true")
parser.add_argument("--use-prev-action", action="store_true")
parser.add_argument("--use-prev-reward", action="store_true")
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.")
parser.add_argument(
    "--stop-iters",
    type=int,
    default=200,
    help="Number of iterations to train.")
parser.add_argument(
    "--stop-timesteps",
    type=int,
    default=100000,
    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=150.0,
    help="Reward at which we stop training.")

_StoreAction(option_strings=['--stop-reward'], dest='stop_reward', nargs=None, const=None, default=150.0, type=<class 'float'>, choices=None, help='Reward at which we stop training.', metavar=None)

In [5]:
import ray
from ray import tune

args = parser.parse_args("--stop-iters=10 --use-prev-action --use-prev-reward".split())

ray.init(num_cpus=args.num_cpus or None)

configs = {
    "PPO": {
        "num_sgd_iter": 5,
        "sgd_minibatch_size": 128,
        "simple_optimizer": True,        
        "model": {
            "vf_share_layers": True,
        },
        "vf_loss_coeff": 0.0001,
    },
    "IMPALA": {
        "num_workers": 2,
        "num_gpus": 0,
        "vf_loss_coeff": 0.01,
    },
}

config = dict(
    configs[args.run],
    **{
        "env": StatelessCartPoleMD,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "model": {
            "use_lstm": True,
            "lstm_cell_size": 256,
            "lstm_use_prev_action": args.use_prev_action,
            "lstm_use_prev_reward": args.use_prev_reward,
        },
        "framework": args.framework,
        # Run with tracing enabled for tfe/tf2?
        "eager_tracing": args.eager_tracing,
    })

stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

# To run the Trainer without tune.run, using our LSTM model and
# manual state-in handling, do the following:

# Example (use `config` from the above code):
# >> import numpy as np
# >> from ray.rllib.agents.ppo import PPOTrainer
# >>
# >> trainer = PPOTrainer(config)
# >> lstm_cell_size = config["model"]["lstm_cell_size"]
# >> env = StatelessCartPole()
# >> obs = env.reset()
# >>
# >> # range(2) b/c h- and c-states of the LSTM.
# >> init_state = state = [
# ..     np.zeros([lstm_cell_size], np.float32) for _ in range(2)
# .. ]
# >> prev_a = 0
# >> prev_r = 0.0
# >>
# >> while True:
# >>     a, state_out, _ = trainer.compute_single_action(
# ..         obs, state, prev_a, prev_r)
# >>     obs, reward, done, _ = env.step(a)
# >>     if done:
# >>         obs = env.reset()
# >>         state = init_state
# >>         prev_a = 0
# >>         prev_r = 0.0
# >>     else:
# >>         state = state_out
# >>         prev_a = a
# >>         prev_r = reward



In [6]:
config

{'num_sgd_iter': 5,
 'sgd_minibatch_size': 128,
 'simple_optimizer': True,
 'model': {'use_lstm': True,
  'lstm_cell_size': 256,
  'lstm_use_prev_action': True,
  'lstm_use_prev_reward': True},
 'vf_loss_coeff': 0.0001,
 'env': __main__.StatelessCartPoleMD,
 'num_gpus': 0,
 'framework': 'torch',
 'eager_tracing': False}

In [7]:
results = tune.run(args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True)

2021-11-17 09:50:21,427	ERROR syncer.py:75 -- Log sync requires rsync to be installed.
[2m[36m(pid=6081)[0m 2021-11-17 09:50:23,059	INFO trainer.py:770 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=6086)[0m   logger.warn(


[2m[36m(pid=6088)[0m   logger.warn(




Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=33.0,episode_reward_min=8.0,episode_reward_mean=13.537414965986395,episode_len_mean=13.537414965986395,episode_media={},episodes_this_iter=294,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1510864571064395, 'mean_inference_ms': 1.8794677341242545, 'mean_action_processing_ms': 0.049682679641403514, 'mean_env_wait_ms': 0.11156463169791632, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=4000,timers={'sample_time_ms': 4879.835, 'sample_throughput': 819.7, 'learn_time_ms': 4524.774, 'learn_throughput': 884.022, 'update_time_ms': 5.0},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999998, 'cur_lr': 5.000000000000001e-05, 'total_loss': -0.05162774252821691, 'policy_loss': -0.06205586806172505, 'vf_loss': 77.98367

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=67.0,episode_reward_min=8.0,episode_reward_mean=16.56611570247934,episode_len_mean=16.56611570247934,episode_media={},episodes_this_iter=242,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.14670172101926773, 'mean_inference_ms': 1.8799829329773303, 'mean_action_processing_ms': 0.049307438186076635, 'mean_env_wait_ms': 0.11051055798849536, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=8000,timers={'sample_time_ms': 7068.638, 'sample_throughput': 565.88, 'learn_time_ms': 4511.547, 'learn_throughput': 886.614, 'update_time_ms': 3.989},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': -0.05222726471609238, 'policy_loss': -0.07017515576021238, 'vf_loss': 150.16884626908737, 'v

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=64.0,episode_reward_min=8.0,episode_reward_mean=20.625,episode_len_mean=20.625,episode_media={},episodes_this_iter=192,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.14268371957292536, 'mean_inference_ms': 1.8834917131665085, 'mean_action_processing_ms': 0.04926111584523383, 'mean_env_wait_ms': 0.11045039560916271, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=12000,timers={'sample_time_ms': 7756.377, 'sample_throughput': 515.705, 'learn_time_ms': 4522.345, 'learn_throughput': 884.497, 'update_time_ms': 3.375},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': -0.015369668642454076, 'policy_loss': -0.037946478370577096, 'vf_loss': 196.2572309551817, 'vf_explained_var': -0

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=76.0,episode_reward_min=9.0,episode_reward_mean=24.74846625766871,episode_len_mean=24.74846625766871,episode_media={},episodes_this_iter=163,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13999387460443846, 'mean_inference_ms': 1.8921175438316031, 'mean_action_processing_ms': 0.049409972891330194, 'mean_env_wait_ms': 0.11091243696397153, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=16000,timers={'sample_time_ms': 8118.676, 'sample_throughput': 492.691, 'learn_time_ms': 4490.377, 'learn_throughput': 890.794, 'update_time_ms': 3.079},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999998, 'cur_lr': 5.000000000000001e-05, 'total_loss': 0.0033017040404956788, 'policy_loss': -0.02532143609132618, 'vf_loss': 270

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=97.0,episode_reward_min=9.0,episode_reward_mean=29.76865671641791,episode_len_mean=29.76865671641791,episode_media={},episodes_this_iter=134,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13703803350515167, 'mean_inference_ms': 1.8877089351382854, 'mean_action_processing_ms': 0.04923409217000663, 'mean_env_wait_ms': 0.11046893977365568, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=20000,timers={'sample_time_ms': 8307.173, 'sample_throughput': 481.512, 'learn_time_ms': 4486.955, 'learn_throughput': 891.473, 'update_time_ms': 3.238},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': -0.008974620524906751, 'policy_loss': -0.051114389466855564, 'vf_loss': 395.1804012876568, 

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=144.0,episode_reward_min=9.0,episode_reward_mean=40.75,episode_len_mean=40.75,episode_media={},episodes_this_iter=97,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13471231496346509, 'mean_inference_ms': 1.8873239733723235, 'mean_action_processing_ms': 0.04920399700085691, 'mean_env_wait_ms': 0.11044715433832804, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=24000,timers={'sample_time_ms': 8441.331, 'sample_throughput': 473.859, 'learn_time_ms': 4449.882, 'learn_throughput': 898.9, 'update_time_ms': 3.064},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.04466253703629429, 'policy_loss': -0.020575117017848022, 'vf_loss': 630.1404970111269, 'vf_explained_var': -0.01965

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=144.0,episode_reward_min=12.0,episode_reward_mean=48.11,episode_len_mean=48.11,episode_media={},episodes_this_iter=83,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13299270319244258, 'mean_inference_ms': 1.8887730514155694, 'mean_action_processing_ms': 0.04922783111264435, 'mean_env_wait_ms': 0.11069827729459215, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=28000,timers={'sample_time_ms': 8527.438, 'sample_throughput': 469.074, 'learn_time_ms': 4424.18, 'learn_throughput': 904.122, 'update_time_ms': 2.942},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.02994333216638276, 'policy_loss': -0.029023381813683292, 'vf_loss': 568.6106243711529, 'vf_explained_var': -0.035

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=154.0,episode_reward_min=12.0,episode_reward_mean=58.3,episode_len_mean=58.3,episode_media={},episodes_this_iter=60,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13174432623531035, 'mean_inference_ms': 1.8937252752823808, 'mean_action_processing_ms': 0.04933836038555456, 'mean_env_wait_ms': 0.111220401813198, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=32000,timers={'sample_time_ms': 8599.616, 'sample_throughput': 465.137, 'learn_time_ms': 4407.203, 'learn_throughput': 907.605, 'update_time_ms': 2.851},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.04629507146098397, 'policy_loss': -0.028992511151414928, 'vf_loss': 729.1446302009351, 'vf_explained_var': -0.047029

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=297.0,episode_reward_min=12.0,episode_reward_mean=73.41,episode_len_mean=73.41,episode_media={},episodes_this_iter=51,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1304204839260088, 'mean_inference_ms': 1.8988378411288358, 'mean_action_processing_ms': 0.049493761600505476, 'mean_env_wait_ms': 0.11135842936240536, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=36000,timers={'sample_time_ms': 8637.324, 'sample_throughput': 463.106, 'learn_time_ms': 4398.633, 'learn_throughput': 909.374, 'update_time_ms': 2.773},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.07521230905937652, 'policy_loss': 5.750938346891692e-05, 'vf_loss': 732.8086897416548, 'vf_explained_var': -0.00

Trial PPO_StatelessCartPoleMD_65802_00000 reported episode_reward_max=297.0,episode_reward_min=12.0,episode_reward_mean=81.01,episode_len_mean=81.01,episode_media={},episodes_this_iter=48,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1289778364205617, 'mean_inference_ms': 1.8984699932914149, 'mean_action_processing_ms': 0.04946774626594633, 'mean_env_wait_ms': 0.11132797302088634, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=40000,timers={'sample_time_ms': 8672.253, 'sample_throughput': 461.241, 'learn_time_ms': 4401.632, 'learn_throughput': 908.754, 'update_time_ms': 2.724},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.05586183291831703, 'policy_loss': -0.025013950122802546, 'vf_loss': 791.1370985551314, 'vf_explained_var': -0.082

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_StatelessCartPoleMD_65802_00000,TERMINATED,172.31.0.4:6081,10,90.7405,40000,81.01,297,12,81.01


[2m[36m(pid=6086)[0m 2021-11-17 09:51:56,655	ERROR worker.py:425 -- SystemExit was raised from the worker
[2m[36m(pid=6086)[0m Traceback (most recent call last):
[2m[36m(pid=6086)[0m   File "python/ray/_raylet.pyx", line 558, in ray._raylet.execute_task
[2m[36m(pid=6086)[0m   File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
[2m[36m(pid=6086)[0m   File "python/ray/_raylet.pyx", line 569, in ray._raylet.execute_task
[2m[36m(pid=6086)[0m   File "python/ray/_raylet.pyx", line 519, in ray._raylet.execute_task.function_executor
[2m[36m(pid=6086)[0m   File "/opt/conda/lib/python3.9/site-packages/ray/_private/function_manager.py", line 576, in actor_method_executor
[2m[36m(pid=6086)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(pid=6086)[0m   File "/opt/conda/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(pid=6086)[0m     return method(self, *_args, **_kwargs)
[2m[36m(pid=6086)

In [9]:
from ray.rllib.agents.ppo import PPOTrainer

checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")

checkpoint_path = checkpoints[0][0]
trainer = PPOTrainer(config)
trainer.restore(checkpoint_path)

# Inference loop.
env = StatelessCartPoleMD()
obs = env.reset()
# range(2) b/c h- and c-states of the LSTM.
lstm_cell_size = 256
init_state = state = [
        np.zeros([lstm_cell_size], np.float32) for _ in range(2)
]

# Run manual inference loop for n episodes.
for _ in range(10):
    episode_reward = 0
    reward = 0.0
    done = False
    obs = env.reset()
    state = init_state
    prev_a = [0, 0]
    prev_r = 0.0

    while not done:
        a, state_out, _ = trainer.compute_single_action(obs, state, prev_action=prev_a, prev_reward=prev_r)
        obs, reward, done, _ = env.step(a)
        episode_reward += reward
        prev_a = a
        prev_r = reward
        state = state_out

    print(f"Episode reward={episode_reward}")




[2m[36m(pid=6085)[0m   logger.warn(
[2m[36m(pid=6084)[0m   logger.warn(
2021-11-17 09:57:33,363	INFO trainable.py:416 -- Restored on 172.31.0.4 from checkpoint: /home/condauser/ray_results/PPO/PPO_StatelessCartPoleMD_65802_00000_0_2021-11-17_09-50-21/checkpoint_000010/checkpoint-10
2021-11-17 09:57:33,365	INFO trainable.py:424 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': 0, '_time_total': 90.74048352241516, '_episodes_total': 1364}


IndexError: index 1 is out of bounds for dimension 1 with size 1

In [9]:
ray.shutdown()