In [1]:
import os
import gymnasium as gym
import numpy as np

import ray
from ray.tune.registry import register_env
from ray.rllib.algorithms import ppo, dqn, appo, impala
from ray.rllib.policy.policy import PolicySpec



In [2]:
from rl.envs.othello import OthelloEnv
from rl.agents.random_policy import OthelloRandomPolicy

In [3]:
# Start a local Ray instance for this notebook session, with the dashboard disabled.
ray.init(include_dashboard=False)

2023-08-16 11:10:21,297	INFO worker.py:1621 -- Started a local Ray instance.


0,1
Python version:,3.8.17
Ray version:,2.6.2


In [4]:
def _make_othello_env(_env_config):
    """Env creator for RLlib: ignores the config it is handed and builds OthelloEnv({})."""
    return OthelloEnv({})


# Make the environment available to RLlib under the name "othello".
register_env("othello", _make_othello_env)

In [5]:
# Model settings later passed to `config.training(model=...)`:
# two 512-unit entries under RLlib's `fcnet_hiddens` key.
model_config = dict(fcnet_hiddens=[512, 512])

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Pick the policy id for ``agent_id`` in the given episode.

    The last character of ``agent_id`` is read as a digit and shifted to a
    zero-based index. When that index equals the parity of
    ``episode.episode_id`` the agent is mapped to ``"agent_1"``; otherwise it
    is mapped to ``"agent_2"``. ``worker`` and ``kwargs`` are accepted for the
    RLlib callback signature but are not used.
    """
    seat_index = int(agent_id[-1]) - 1
    if episode.episode_id % 2 == seat_index:
        return "agent_1"
    return "agent_2"

# Build the PPO configuration as a single fluent chain.
config = (
    ppo.PPOConfig()
    .environment("othello")
    .framework("torch")
    # Half of the logical CPUs are used as rollout workers. os.cpu_count() can
    # return None when the count is undetermined, so fall back to 1 (giving 0
    # remote workers) instead of failing with a TypeError on `None // 2`.
    .rollouts(num_rollout_workers=(os.cpu_count() or 1) // 2)
    .resources(num_gpus=0)
    .multi_agent(
        policies={
            "agent_1": PolicySpec(),
            "agent_2": PolicySpec(policy_class=OthelloRandomPolicy),
        },
        policy_mapping_fn=policy_mapping_fn,
        # Only "agent_1" is listed for training; "agent_2" uses OthelloRandomPolicy.
        policies_to_train=["agent_1"],
    )
    # The Learner and RLModule APIs are both switched off explicitly.
    .training(model=model_config, _enable_learner_api=False)
    .rl_module(_enable_rl_module_api=False)
)
# Alternative model settings, kept for reference:
# config = config.training(num_sgd_iter=10, model={"conv_filters": [[32, [3, 3], 1], [64, [3, 3], 1]]})
# config = config.training(model={"conv_filters": [[32, [3, 3], 2], [64, [3, 3], 2]]}, _enable_learner_api=False)



In [6]:
# import random
# # multiagent environment for othello
# env = OthelloEnv({})
# obs, _ = env.reset()
# terminated = False
# current_player = "agent_1"
# while not terminated:
#     env.render()
#     # action = env.action_space.sample()
#     valid_actions = env.get_valid_moves(current_player)
#     print(current_player, obs.keys())
#     print(obs[current_player].shape)
#     action = algo.compute_single_action(obs[current_player], policy_id=current_player)
#     print(action)
#     if len(valid_actions) > 0:
#         action = random.choice(valid_actions)
#     else:
#         action = 64
#     action = {current_player: action}
#     obs, reward, terminated, truncated, _= env.step(action)
#     reward = reward[current_player]
#     terminated = terminated[current_player]
#     truncated = truncated[current_player]
#     current_player = env.current_player
#     print(terminated, truncated, reward)

In [7]:
from ray import air
from ray import tune

# Save a checkpoint with frequency 50 and once more at the end of the run.
checkpoint_config = air.CheckpointConfig(checkpoint_frequency=50, checkpoint_at_end=True)
run_config = air.RunConfig(checkpoint_config=checkpoint_config)

# Train "PPO" using the notebook's config as the parameter space.
# NOTE(review): no stop criterion is passed, so the run presumably continues
# until it is interrupted — confirm this is intended.
tuner = tune.Tuner("PPO", param_space=config, run_config=run_config)
results = tuner.fit()

0,1
Current time:,2023-08-16 11:14:56
Running for:,00:04:33.03
Memory:,11.1/15.9 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_othello_0ed33_00000,RUNNING,127.0.0.1:24968,26,255.127,104000,-23.9519,15,-75,4.27487


[2m[36m(PPO pid=24968)[0m Install gputil for GPU system monitoring.


In [None]:
# Show the object returned by Tuner.fit() as the cell output.
results

[2m[36m(PPO pid=24764)[0m 2023-08-16 10:33:36,334	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=26568, ip=127.0.0.1, actor_id=5c67fffa69c39dc7209aaf9b01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x0000027092C78B50>)
[2m[36m(PPO pid=24764)[0m   File "c:\Users\nYoshiaki\Documents\rl-mini-app\rl\envs\othello.py", line 64, in step
[2m[36m(PPO pid=24764)[0m     action_dict[self.current_player], terminated, truncated
[2m[36m(PPO pid=24764)[0m KeyError: 'agent_1'
[2m[36m(PPO pid=24764)[0m 
[2m[36m(PPO pid=24764)[0m The above exception was the direct cause of the following exception:
[2m[36m(PPO pid=24764)[0m 
[2m[36m(PPO pid=24764)[0m [36mray::RolloutWorker.__init__()[39m (pid=26568, ip=127.0.0.1, actor_id=5c67fffa69c39dc7209aaf9b01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at

ResultGrid<[
  Result(
    error='TuneError',
    metrics={'trial_id': 'e596e_00000'},
    path='c://\\Users\\nYoshiaki\\ray_results\\PPO\\PPO_othello_e596e_00000_0_2023-08-16_10-33-26',
    checkpoint=None
  )
]>

[2m[36m(RolloutWorker pid=19464)[0m Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=19464, ip=127.0.0.1, actor_id=1488ad99d47a08275b648a1801000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x000001F445E38C10>)
[2m[36m(RolloutWorker pid=19464)[0m   File "c:\Users\nYoshiaki\Documents\rl-mini-app\rl\envs\othello.py", line 64, in step
[2m[36m(RolloutWorker pid=19464)[0m     action_dict[self.current_player], terminated, truncated
[2m[36m(RolloutWorker pid=19464)[0m KeyError: 'agent_1'
[2m[36m(RolloutWorker pid=19464)[0m 
[2m[36m(RolloutWorker pid=19464)[0m The above exception was the direct cause of the following exception:
[2m[36m(RolloutWorker pid=19464)[0m 
[2m[36m(RolloutWorker pid=19464)[0m [36mray::RolloutWorker.__init__()[39m (pid=19464, ip=127.0.0.1, actor_id=1488ad99d47a08275b648a1801000000, repr=<ray.rllib.evaluation.rollout_worker.Roll

In [None]:
# Take the checkpoint attached to the best result of the run and print it.
best_result = results.get_best_result()
checkpoint = best_result.checkpoint
print(checkpoint)

None


In [None]:
import ray
# Shut down the running Ray instance before an Algorithm is built directly.
ray.shutdown()
# Register the "othello" environment again.
# NOTE(review): presumably required because the earlier registration does not
# survive ray.shutdown() — confirm.
register_env("othello", lambda _: OthelloEnv({}))

In [None]:
# Turn exploration off for the restored algorithm; the config attribute is `explore`.
config.explore = False
# Build a fresh Algorithm from the config and load the trained weights into it.
algo = config.build()
algo.restore(checkpoint)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-08-16 10:33:41,849	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-16 10:33:48,544	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=10472, ip=127.0.0.1, 

ValueError: Traceback (most recent call last):
  File "c:\Users\nYoshiaki\AppData\Local\miniconda3\envs\rl\lib\site-packages\ray\rllib\utils\pre_checks\env.py", line 363, in check_multiagent_environments
    results = env.step(sampled_action)
  File "c:\Users\nYoshiaki\Documents\rl-mini-app\rl\envs\othello.py", line 64, in step
    action_dict[self.current_player], terminated, truncated
KeyError: 'agent_1'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\nYoshiaki\AppData\Local\miniconda3\envs\rl\lib\site-packages\ray\rllib\utils\pre_checks\env.py", line 81, in check_env
    check_multiagent_environments(env)
  File "c:\Users\nYoshiaki\AppData\Local\miniconda3\envs\rl\lib\site-packages\ray\rllib\utils\pre_checks\env.py", line 368, in check_multiagent_environments
    raise ValueError(
ValueError: Your environment (<OthelloEnv instance>) does not abide to the new gymnasium-style API!
From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs.
In particular, the `step()` method seems to be faulty.
Learn more about the most important changes here:
https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium

In order to fix this problem, do the following:

1) Run `pip install gymnasium` on your command line.
2) Change all your import statements in your code from
   `import gym` -> `import gymnasium as gym` OR
   `from gym.space import Discrete` -> `from gymnasium.spaces import Discrete`

For your custom (single agent) gym.Env classes:
3.1) Either wrap your old Env class via the provided `from gymnasium.wrappers import
     EnvCompatibility` wrapper class.
3.2) Alternatively to 3.1:
 - Change your `reset()` method to have the call signature 'def reset(self, *,
   seed=None, options=None)'
 - Return an additional info dict (empty dict should be fine) from your `reset()`
   method.
 - Return an additional `truncated` flag from your `step()` method (between `done` and
   `info`). This flag should indicate, whether the episode was terminated prematurely
   due to some time constraint or other kind of horizon setting.

For your custom RLlib `MultiAgentEnv` classes:
4.1) Either wrap your old MultiAgentEnv via the provided
     `from ray.rllib.env.wrappers.multi_agent_env_compatibility import
     MultiAgentEnvCompatibility` wrapper class.
4.2) Alternatively to 4.1:
 - Change your `reset()` method to have the call signature
   'def reset(self, *, seed=None, options=None)'
 - Return an additional per-agent info dict (empty dict should be fine) from your
   `reset()` method.
 - Rename `dones` into `terminateds` and only set this to True, if the episode is really
   done (as opposed to has been terminated prematurely due to some horizon/time-limit
   setting).
 - Return an additional `truncateds` per-agent dictionary flag from your `step()`
   method, including the `__all__` key (100% analogous to your `dones/terminateds`
   per-agent dict).
   Return this new `truncateds` dict between `dones/terminateds` and `infos`. This
   flag should indicate, whether the episode (for some agent or all agents) was
   terminated prematurely due to some time constraint or other kind of horizon setting.


The above error has been found in your environment! We've added a module for checking your custom environments. It may cause your experiment to fail if your environment is not set up correctly. You can disable this behavior via calling `config.environment(disable_env_checking=True)`. You can run the environment checking module standalone by calling ray.rllib.utils.check_env([your env]).

In [None]:
# Export the "agent_1" policy's model to "othello_policy" in ONNX form.
# NOTE(review): 18 is presumably the ONNX opset version — confirm against the RLlib docs.
trained_policy = algo.get_policy("agent_1")
trained_policy.export_model("othello_policy", onnx=18)

verbose: False, log level: Level.ERROR



In [None]:
import random
# Play one Othello episode in the multi-agent environment, letting the
# restored algorithm choose every move.
env = OthelloEnv({})
obs, _ = env.reset()
current_player = "agent_1"
episode_over = False
while not episode_over:
    env.render()
    # The id of the player to move is also used as the policy id.
    move = algo.compute_single_action(obs[current_player], policy_id=current_player)
    action = {current_player: move}
    print(action)
    # Keep the per-agent dicts under their own names instead of overwriting
    # them with the acting player's scalar entries.
    obs, rewards, terminateds, truncateds, _ = env.step(action)
    reward = rewards[current_player]
    terminated = terminateds[current_player]
    truncated = truncateds[current_player]
    # An episode ends on termination OR truncation; checking only `terminated`
    # would keep stepping an episode that has already been truncated.
    episode_over = terminated or truncated
    current_player = env.current_player

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|X|.|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

{'agent_1': 29}
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.



TypeError: object of type 'int' has no len()

In [None]:
# Render the environment's current board state.
env.render()

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|X|X|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

