In [1]:
import gym
import math
import numpy as np
from gym import spaces
from gym.utils import seeding

In [2]:
class StatelessCartPole(gym.Env):
    """Partially observable variant of the CartPole gym environment.

    https://github.com/openai/gym/blob/master/gym/envs/classic_control/
    cartpole.py

    We delete the velocity component of the state, so that it can only be
    solved by a LSTM policy.
    """

    metadata = {
        "render.modes": ["human", "rgb_array"],
        "video.frames_per_second": 60
    }

    def __init__(self, config=None):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        high = np.array([
            self.x_threshold * 2,
            self.theta_threshold_radians * 2,
        ])

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-high, high)

        self.seed()
        self.viewer = None
        self.state = None

        self.steps_beyond_done = None

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta
                ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length *
            (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)
        )
        xacc = (temp -
                self.polemass_length * thetaacc * costheta / self.total_mass)
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)
        done = (x < -self.x_threshold or x > self.x_threshold
                or theta < -self.theta_threshold_radians
                or theta > self.theta_threshold_radians)
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            self.steps_beyond_done += 1
            reward = 0.0

        rv = np.r_[self.state[0], self.state[2]]
        return rv, reward, done, {}

    def reset(self):
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, ))
        self.steps_beyond_done = None

        rv = np.r_[self.state[0], self.state[2]]
        return rv

    def render(self, mode="human"):
        screen_width = 600
        screen_height = 400

        world_width = self.x_threshold * 2
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = (-cartwidth / 2, cartwidth / 2, cartheight / 2,
                          -cartheight / 2)
            axleoffset = cartheight / 4.0
            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l, r, t, b = (-polewidth / 2, polewidth / 2,
                          polelen - polewidth / 2, -polewidth / 2)
            pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            pole.set_color(.8, .6, .4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth / 2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5, .5, .8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.state is None:
            return None

        x = self.state
        cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array=mode == "rgb_array")

    def close(self):
        if self.viewer:
            self.viewer.close()


In [3]:
import argparse
import os
from ray.rllib.utils.test_utils import check_learning_achieved

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument(
    "--run",
    type=str,
    default="PPO",
    help="The RLlib-registered algorithm to use.")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="torch",
    help="The DL framework specifier.")
parser.add_argument("--eager-tracing", action="store_true")
parser.add_argument("--use-prev-action", action="store_true")
parser.add_argument("--use-prev-reward", action="store_true")
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.")
parser.add_argument(
    "--stop-iters",
    type=int,
    default=200,
    help="Number of iterations to train.")
parser.add_argument(
    "--stop-timesteps",
    type=int,
    default=100000,
    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=150.0,
    help="Reward at which we stop training.")

_StoreAction(option_strings=['--stop-reward'], dest='stop_reward', nargs=None, const=None, default=150.0, type=<class 'float'>, choices=None, help='Reward at which we stop training.', metavar=None)

In [5]:
import ray
from ray import tune

args = parser.parse_args("--stop-iters=10 --use-prev-action --use-prev-reward".split())

ray.init(num_cpus=args.num_cpus or None)

configs = {
    "PPO": {
        "num_sgd_iter": 5,
        "sgd_minibatch_size": 128,
        "simple_optimizer": True,        
        "model": {
            "vf_share_layers": True,
        },
        "vf_loss_coeff": 0.0001,
    },
    "IMPALA": {
        "num_workers": 2,
        "num_gpus": 0,
        "vf_loss_coeff": 0.01,
    },
}

config = dict(
    configs[args.run],
    **{
        "env": StatelessCartPole,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "model": {
            "use_lstm": True,
            "lstm_cell_size": 256,
            "lstm_use_prev_action": args.use_prev_action,
            "lstm_use_prev_reward": args.use_prev_reward,
        },
        "framework": args.framework,
        # Run with tracing enabled for tfe/tf2?
        "eager_tracing": args.eager_tracing,
    })

stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

# To run the Trainer without tune.run, using our LSTM model and
# manual state-in handling, do the following:

# Example (use `config` from the above code):
# >> import numpy as np
# >> from ray.rllib.agents.ppo import PPOTrainer
# >>
# >> trainer = PPOTrainer(config)
# >> lstm_cell_size = config["model"]["lstm_cell_size"]
# >> env = StatelessCartPole()
# >> obs = env.reset()
# >>
# >> # range(2) b/c h- and c-states of the LSTM.
# >> init_state = state = [
# ..     np.zeros([lstm_cell_size], np.float32) for _ in range(2)
# .. ]
# >> prev_a = 0
# >> prev_r = 0.0
# >>
# >> while True:
# >>     a, state_out, _ = trainer.compute_single_action(
# ..         obs, state, prev_a, prev_r)
# >>     obs, reward, done, _ = env.step(a)
# >>     if done:
# >>         obs = env.reset()
# >>         state = init_state
# >>         prev_a = 0
# >>         prev_r = 0.0
# >>     else:
# >>         state = state_out
# >>         prev_a = a
# >>         prev_r = reward



In [6]:
config

{'num_sgd_iter': 5,
 'sgd_minibatch_size': 128,
 'simple_optimizer': True,
 'model': {'use_lstm': True,
  'lstm_cell_size': 256,
  'lstm_use_prev_action': True,
  'lstm_use_prev_reward': True},
 'vf_loss_coeff': 0.0001,
 'env': __main__.StatelessCartPole,
 'num_gpus': 0,
 'framework': 'torch',
 'eager_tracing': False}

In [7]:
results = tune.run(args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True)

2021-11-17 09:54:48,504	ERROR syncer.py:75 -- Log sync requires rsync to be installed.
[2m[36m(pid=6926)[0m 2021-11-17 09:54:50,189	INFO trainer.py:770 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=6916)[0m   logger.warn(
[2m[36m(pid=6920)[0m   logger.warn(




Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=74.0,episode_reward_min=9.0,episode_reward_mean=21.785714285714285,episode_len_mean=21.785714285714285,episode_media={},episodes_this_iter=182,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.14105000705326087, 'mean_inference_ms': 1.6528654878614037, 'mean_action_processing_ms': 0.055174711411258445, 'mean_env_wait_ms': 0.10674619796974744, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=4000,timers={'sample_time_ms': 4372.744, 'sample_throughput': 914.758, 'learn_time_ms': 4296.595, 'learn_throughput': 930.97, 'update_time_ms': 2.427},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999998, 'cur_lr': 5.000000000000001e-05, 'total_loss': -0.013530209939926863, 'policy_loss': -0.03782545167487115, 'vf_loss': 219.4

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=85.0,episode_reward_min=11.0,episode_reward_mean=28.905797101449274,episode_len_mean=28.905797101449274,episode_media={},episodes_this_iter=138,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.13440105482905731, 'mean_inference_ms': 1.6066227094678602, 'mean_action_processing_ms': 0.05308624149352066, 'mean_env_wait_ms': 0.10312974283055269, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=8000,timers={'sample_time_ms': 6278.027, 'sample_throughput': 637.143, 'learn_time_ms': 4285.265, 'learn_throughput': 933.431, 'update_time_ms': 2.556},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.019632059053489657, 'policy_loss': -0.02342396839098497, 'vf_loss': 413.04342286081027, '

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=119.0,episode_reward_min=11.0,episode_reward_mean=42.75,episode_len_mean=42.75,episode_media={},episodes_this_iter=89,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1334640650507028, 'mean_inference_ms': 1.6383927345005065, 'mean_action_processing_ms': 0.05403298747732385, 'mean_env_wait_ms': 0.10464187628147618, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=12000,timers={'sample_time_ms': 7064.135, 'sample_throughput': 566.241, 'learn_time_ms': 4323.192, 'learn_throughput': 925.242, 'update_time_ms': 2.413},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.03469155574443214, 'policy_loss': -0.03375715291494447, 'vf_loss': 668.4830734715317, 'vf_explained_var': -0.003634

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=148.0,episode_reward_min=12.0,episode_reward_mean=56.04,episode_len_mean=56.04,episode_media={},episodes_this_iter=68,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1317059025188922, 'mean_inference_ms': 1.6450536233929083, 'mean_action_processing_ms': 0.05403977787226388, 'mean_env_wait_ms': 0.10505896508070886, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=16000,timers={'sample_time_ms': 7450.929, 'sample_throughput': 536.846, 'learn_time_ms': 4332.953, 'learn_throughput': 923.158, 'update_time_ms': 2.392},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.039460188167339025, 'policy_loss': -0.05475125294743162, 'vf_loss': 925.6147934422348, 'vf_explained_var': -0.02690

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=199.0,episode_reward_min=12.0,episode_reward_mean=69.0,episode_len_mean=69.0,episode_media={},episodes_this_iter=46,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1302693939858247, 'mean_inference_ms': 1.6445540120709592, 'mean_action_processing_ms': 0.054189910292762614, 'mean_env_wait_ms': 0.10500728520623709, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=20000,timers={'sample_time_ms': 7656.826, 'sample_throughput': 522.41, 'learn_time_ms': 4328.171, 'learn_throughput': 924.178, 'update_time_ms': 2.403},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.07950396437310811, 'policy_loss': -0.03738198415799574, 'vf_loss': 1145.5631712942413, 'vf_explained_var': -0.1179043

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=336.0,episode_reward_min=12.0,episode_reward_mean=65.33,episode_len_mean=65.33,episode_media={},episodes_this_iter=72,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1290892645727551, 'mean_inference_ms': 1.65357717002127, 'mean_action_processing_ms': 0.05448484886964919, 'mean_env_wait_ms': 0.1053533642517326, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=24000,timers={'sample_time_ms': 7835.183, 'sample_throughput': 510.518, 'learn_time_ms': 4352.036, 'learn_throughput': 919.11, 'update_time_ms': 2.356},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.004171868046802102, 'policy_loss': -0.052059624863393376, 'vf_loss': 530.9887540875059, 'vf_explained_var': -0.28401896

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=336.0,episode_reward_min=16.0,episode_reward_mean=74.72,episode_len_mean=74.72,episode_media={},episodes_this_iter=43,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1284386966710144, 'mean_inference_ms': 1.6555806141618914, 'mean_action_processing_ms': 0.05453696507317079, 'mean_env_wait_ms': 0.10542598843986205, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=28000,timers={'sample_time_ms': 7948.928, 'sample_throughput': 503.213, 'learn_time_ms': 4337.746, 'learn_throughput': 922.138, 'update_time_ms': 2.357},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.0437585218273329, 'policy_loss': -0.0440175752635255, 'vf_loss': 855.2423182631984, 'vf_explained_var': -0.15156361

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=298.0,episode_reward_min=14.0,episode_reward_mean=92.42,episode_len_mean=92.42,episode_media={},episodes_this_iter=38,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.12741076972783394, 'mean_inference_ms': 1.6530452251922276, 'mean_action_processing_ms': 0.05443674477484267, 'mean_env_wait_ms': 0.10531209295626351, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=32000,timers={'sample_time_ms': 8000.448, 'sample_throughput': 499.972, 'learn_time_ms': 4323.071, 'learn_throughput': 925.268, 'update_time_ms': 2.333},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.05420476183521025, 'policy_loss': -0.02725828059130546, 'vf_loss': 799.5932944557884, 'vf_explained_var': -0.13317

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=298.0,episode_reward_min=14.0,episode_reward_mean=107.39,episode_len_mean=107.39,episode_media={},episodes_this_iter=29,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.12659400104925278, 'mean_inference_ms': 1.651657456045081, 'mean_action_processing_ms': 0.05444165264284477, 'mean_env_wait_ms': 0.10529071374671667, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=36000,timers={'sample_time_ms': 8039.136, 'sample_throughput': 497.566, 'learn_time_ms': 4321.802, 'learn_throughput': 925.54, 'update_time_ms': 2.309},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.07291675565594977, 'policy_loss': -0.028507097409755894, 'vf_loss': 1002.4483799789891, 'vf_explained_var': -0.091

Trial PPO_StatelessCartPole_04a94_00000 reported episode_reward_max=311.0,episode_reward_min=14.0,episode_reward_mean=126.61,episode_len_mean=126.61,episode_media={},episodes_this_iter=24,policy_reward_min={},policy_reward_max={},policy_reward_mean={},custom_metrics={},sampler_perf={'mean_raw_obs_processing_ms': 0.1258227305048867, 'mean_inference_ms': 1.6504020957778869, 'mean_action_processing_ms': 0.05438220198486384, 'mean_env_wait_ms': 0.10526007608929788, 'mean_env_render_ms': 0.0},off_policy_estimator={},num_healthy_workers=2,timesteps_this_iter=0,agent_timesteps_total=40000,timers={'sample_time_ms': 8080.301, 'sample_throughput': 495.031, 'learn_time_ms': 4334.94, 'learn_throughput': 922.735, 'update_time_ms': 2.288},info={'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.19999999999999996, 'cur_lr': 5e-05, 'total_loss': 0.07479058060082881, 'policy_loss': -0.02539817564846838, 'vf_loss': 983.9349574369543, 'vf_explained_var': -0.10829

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_StatelessCartPole_04a94_00000,TERMINATED,172.31.0.4:6926,10,84.8593,40000,126.61,311,14,126.61


[2m[36m(pid=6916)[0m 2021-11-17 09:56:17,537	ERROR worker.py:425 -- SystemExit was raised from the worker
[2m[36m(pid=6916)[0m Traceback (most recent call last):
[2m[36m(pid=6916)[0m   File "python/ray/_raylet.pyx", line 558, in ray._raylet.execute_task
[2m[36m(pid=6916)[0m   File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
[2m[36m(pid=6916)[0m   File "python/ray/_raylet.pyx", line 569, in ray._raylet.execute_task
[2m[36m(pid=6916)[0m   File "python/ray/_raylet.pyx", line 519, in ray._raylet.execute_task.function_executor
[2m[36m(pid=6916)[0m   File "/opt/conda/lib/python3.9/site-packages/ray/_private/function_manager.py", line 576, in actor_method_executor
[2m[36m(pid=6916)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(pid=6916)[0m   File "/opt/conda/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(pid=6916)[0m     return method(self, *_args, **_kwargs)
[2m[36m(pid=6916)

In [8]:
from ray.rllib.agents.ppo import PPOTrainer

checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")

checkpoint_path = checkpoints[0][0]
trainer = PPOTrainer(config)
trainer.restore(checkpoint_path)

# Inference loop.
env = StatelessCartPole()
obs = env.reset()
# range(2) b/c h- and c-states of the LSTM.
lstm_cell_size = 256
init_state = state = [
        np.zeros([lstm_cell_size], np.float32) for _ in range(2)
]

# Run manual inference loop for n episodes.
for _ in range(10):
    episode_reward = 0
    reward = 0.0
    done = False
    obs = env.reset()
    state = init_state
    prev_a = 1
    prev_r = 0.0

    while not done:
        a, state_out, _ = trainer.compute_single_action(obs, state, prev_action=prev_a, prev_reward=prev_r)
        obs, reward, done, _ = env.step(a)
        episode_reward += reward
        prev_a = a
        prev_r = reward
        state = state_out

    print(f"Episode reward={episode_reward}")




2021-11-17 09:56:44,812	INFO trainer.py:770 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=6921)[0m   logger.warn(
[2m[36m(pid=6923)[0m   logger.warn(
2021-11-17 09:56:46,767	INFO trainable.py:416 -- Restored on 172.31.0.4 from checkpoint: /home/condauser/ray_results/PPO/PPO_StatelessCartPole_04a94_00000_0_2021-11-17_09-54-48/checkpoint_000010/checkpoint-10
2021-11-17 09:56:46,769	INFO trainable.py:424 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': 0, '_time_total': 84.85933876037598, '_episodes_total': 729}
  logger.warn(


Episode reward=128.0
Episode reward=63.0
Episode reward=77.0
Episode reward=103.0
Episode reward=50.0
Episode reward=28.0
Episode reward=214.0
Episode reward=128.0
Episode reward=190.0
Episode reward=115.0


In [None]:
ray.shutdown()