In [1]:
from gym.spaces import Box
import numpy as np
import gym
from gym.spaces import Discrete, MultiDiscrete
from typing import Dict, List, Union

from gym.envs.classic_control import CartPoleEnv

In [2]:
class StatelessCartPole(CartPoleEnv):
    """Partially observable variant of the CartPole gym environment.

    https://github.com/openai/gym/blob/master/gym/envs/classic_control/
    cartpole.py

    We delete the x- and angular velocity components of the state, so that it
    can only be solved by a memory enhanced model (policy).
    """

    def __init__(self, config=None):
        super().__init__()

        # Fix our observation-space (remove 2 velocity components).
        high = np.array(
            [
                self.x_threshold * 2,
                self.theta_threshold_radians * 2,
            ],
            dtype=np.float32)

        self.observation_space = Box(low=-high, high=high, dtype=np.float32)

    def step(self, action):
        next_obs, reward, done, info = super().step(action)
        # next_obs is [x-pos, x-veloc, angle, angle-veloc]
        return np.array([next_obs[0], next_obs[2]]), reward, done, info

    def reset(self):
        init_obs = super().reset()
        # init_obs is [x-pos, x-veloc, angle, angle-veloc]
        return np.array([init_obs[0], init_obs[2]])

In [3]:
import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import one_hot as torch_one_hot
from ray.rllib.utils.typing import ModelConfigDict, TensorType
from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog



In [4]:
torch, nn = try_import_torch()

In [5]:
class TorchFrameStackingCartPoleModel(TorchModelV2, nn.Module):
    """A simple FC model that takes the last n observations as input."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 num_frames=3):
        nn.Module.__init__(self)
        super(TorchFrameStackingCartPoleModel, self).__init__(
            obs_space, action_space, None, model_config, name)

        self.num_frames = num_frames
        self.num_outputs = num_outputs

        # Construct actual (very simple) FC model.
        assert len(obs_space.shape) == 1
        in_size = self.num_frames * (obs_space.shape[0] + action_space.n + 1)
        self.layer1 = SlimFC(
            in_size=in_size, out_size=256, activation_fn="relu")
        self.layer2 = SlimFC(in_size=256, out_size=256, activation_fn="relu")
        self.out = SlimFC(
            in_size=256, out_size=self.num_outputs, activation_fn="linear")
        self.values = SlimFC(in_size=256, out_size=1, activation_fn="linear")

        self._last_value = None

        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs",
            shift="-{}:0".format(num_frames - 1),
            space=obs_space)
        self.view_requirements["prev_n_rewards"] = ViewRequirement(
            data_col="rewards", shift="-{}:-1".format(self.num_frames))
        self.view_requirements["prev_n_actions"] = ViewRequirement(
            data_col="actions",
            shift="-{}:-1".format(self.num_frames),
            space=self.action_space)

    def forward(self, input_dict, states, seq_lens):
        obs = input_dict["prev_n_obs"]
        obs = torch.reshape(obs, [-1, self.obs_space.shape[0] * self.num_frames])
        rewards = torch.reshape(input_dict["prev_n_rewards"],
                                [-1, self.num_frames])
        actions = torch_one_hot(input_dict["prev_n_actions"],
                                self.action_space)
        actions = torch.reshape(actions, [-1, self.num_frames * actions.shape[-1]])
        input_ = torch.cat([obs, actions, rewards], dim=-1)
        features = self.layer1(input_)
        features = self.layer2(features)
        out = self.out(features)
        self._last_value = self.values(features)
        return out, []

    def value_function(self):
        return torch.squeeze(self._last_value, -1)

In [6]:
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.evaluation import Episode, RolloutWorker
from ray.rllib.env import BaseEnv
from ray.rllib.policy import Policy

class MyCallbacks(DefaultCallbacks):
    def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                        policies: Dict[str, Policy], episode: Episode,
                        env_index: int, **kwargs):
        # Make sure this episode is ongoing.
        assert episode.length > 0, \
            "ERROR: `on_episode_step()` callback should not be called right " \
            "after env reset!"
        print(episode.input_dict["prev_n_obs"])


In [7]:
ray.init(num_cpus=0 or None)



{'node_ip_address': '172.18.0.3',
 'raylet_ip_address': '172.18.0.3',
 'redis_address': '172.18.0.3:6379',
 'object_store_address': '/tmp/ray/session_2021-12-14_17-55-25_995835_32/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-14_17-55-25_995835_32/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-12-14_17-55-25_995835_32',
 'metrics_export_port': 51534,
 'node_id': '4cfa206f85296778205593c3a2a7826cd686ff59bc5a6bbd760a4418'}

In [8]:
ModelCatalog.register_custom_model("frame_stack_model", TorchFrameStackingCartPoleModel)
register_env("StatelessPendulum", lambda _: StatelessCartPole())


In [9]:
env = StatelessCartPole()

In [10]:
env.reset()

array([-0.01925686,  0.02920437], dtype=float32)

In [11]:
num_frames = 20
config = {
    "env": 'StatelessPendulum',
    "gamma": 0.9,
    "num_gpus": 0,
    "num_workers": 0,
    "num_envs_per_worker": 20,
    "callbacks": MyCallbacks,
    "entropy_coeff": 0.001,
    "num_sgd_iter": 5,
    "vf_loss_coeff": 1e-5,
    "model": {
        "vf_share_layers": True,
        "custom_model": "frame_stack_model",
        "custom_model_config": {
            "num_frames": num_frames,
        },
    },
    "framework": 'torch',
}


In [12]:
stop = {
    "training_iteration": 10,
    "timesteps_total": 100000,
    "episode_reward_mean": 300.,
}

In [13]:
results = tune.run('PPO', 
                   config=config, 
                   stop=stop, 
                   verbose=2,
                   checkpoint_at_end=True)

2021-12-14 17:55:43,243	INFO logger.py:605 -- pip install "ray[tune]" to see TensorBoard files.
2021-12-14 17:55:43,669	ERROR syncer.py:111 -- Log sync requires rsync to be installed.
[2m[36m(PPO pid=166)[0m 2021-12-14 17:55:45,419	INFO ppo.py:166 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=166)[0m 2021-12-14 17:55:45,419	INFO trainer.py:719 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


2021-12-14 17:55:46,887	ERROR trial_runner.py:958 -- Trial PPO_StatelessPendulum_acd02_00000: Error processing event.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/tune/trial_runner.py", line 924, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/opt/conda/lib/python3.9/site-packages/ray/tune/ray_trial_executor.py", line 783, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/worker.py", line 1712, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): [36mray::PPO.train()[39m (pid=166, ip=172.18.0.3, repr=PPO)
  File "/opt/conda/lib/python3.9/site-packages/ray/tune/trainable.py", line 314, in train
    result = self.step()
  File "/opt/conda/lib/python3.9/site-pac

The trial PPO_StatelessPendulum_acd02_00000 errored with parameters={'env': 'StatelessPendulum', 'gamma': 0.9, 'num_gpus': 0, 'num_workers': 0, 'num_envs_per_worker': 20, 'callbacks': <class '__main__.MyCallbacks'>, 'entropy_coeff': 0.001, 'num_sgd_iter': 5, 'vf_loss_coeff': 1e-05, 'model': {'vf_share_layers': True, 'custom_model': 'frame_stack_model', 'custom_model_config': {'num_frames': 20}}, 'framework': 'torch'}. Error file: /home/condauser/ray_results/PPO/PPO_StatelessPendulum_acd02_00000_0_2021-12-14_17-55-43/error.txt


Trial name,status,loc
PPO_StatelessPendulum_acd02_00000,ERROR,172.18.0.3:166

Trial name,# failures,error file
PPO_StatelessPendulum_acd02_00000,1,/home/condauser/ray_results/PPO/PPO_StatelessPendulum_acd02_00000_0_2021-12-14_17-55-43/error.txt


TuneError: ('Trials did not complete', [PPO_StatelessPendulum_acd02_00000])

In [36]:
checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")

checkpoint_path = checkpoints[0][0]
trainer = PPOTrainer(config)
trainer.restore(checkpoint_path)

# Inference loop.
env = StatelessCartPole()

# Run manual inference loop for n episodes.
for _ in range(10):
    episode_reward = 0.0
    reward = 0.0
    action = 0
    done = False
    obs = env.reset()
    while not done:
        # Create a dummy action using the same observation n times,
        # as well as dummy prev-n-actions and prev-n-rewards.
        action, state, logits = trainer.compute_single_action(
            input_dict={
                "obs": obs,
                "prev_n_obs": np.stack([obs for _ in range(num_frames)]),
                "prev_n_actions": np.stack([0 for _ in range(num_frames)]),
                "prev_n_rewards": np.stack(
                    [1.0 for _ in range(num_frames)]),
            },
            full_fetch=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward

    print(f"Episode reward={episode_reward}")


2021-12-10 17:50:25,798	INFO trainable.py:467 -- Restored on 172.18.0.3 from checkpoint: /home/condauser/ray_results/PPO/PPO_StatelessPendulum_2fef5_00000_0_2021-12-10_17-49-48/checkpoint_000025/checkpoint-25
2021-12-10 17:50:25,800	INFO trainable.py:475 -- Current state after restoring: {'_iteration': 25, '_timesteps_total': 0, '_time_total': 33.76933407783508, '_episodes_total': 1487}


Episode reward=10.0
Episode reward=10.0
Episode reward=9.0
Episode reward=10.0
Episode reward=10.0
Episode reward=10.0
Episode reward=10.0
Episode reward=10.0
Episode reward=8.0
Episode reward=10.0


In [None]:
ray.shutdown()