In [1]:
import gymnasium as gym
import numpy as np
from typing import Any, Dict, List, Tuple, Optional
from ray.rllib.env.multi_agent_env import MultiAgentEnv



In [2]:
from ray.tune.registry import register_env
from ray.rllib.algorithms import ppo, dqn, appo, impala
from ray.rllib.policy.policy import PolicySpec, Policy
from ray.rllib.utils.annotations import override
from ray.rllib.examples.policy.random_policy import RandomPolicy
from rl.envs.othello import OthelloEnv



In [3]:
register_env("othello", lambda _: OthelloEnv({}))

In [4]:
import gymnasium as gym
action = gym.spaces.Box(-1, 1, shape=(1, 1), dtype=np.float32).sample()
type(action)

numpy.ndarray

In [5]:
from typing import List
import random


class OthelloRandomPolicy(RandomPolicy):
    @override(Policy)
    def compute_actions(self, obs_batch, **kwargs):
        actions: List[int] = [
            self._valid_random_action(obs) for obs in obs_batch
        ] 
        return actions, [], {}

    def _valid_random_action(self, obs) -> int:
        valid_actions: List[int] = []
        for row in range(obs.shape[0]):
            for col in range(obs.shape[1]):
                if self._is_valid_action(obs, row, col):
                    valid_actions.append(row * obs.shape[0] + col)

        return random.choice(valid_actions)

    def _is_valid_move(self, row: int, col: int) -> bool:
        if row < 0 or row > 7 or col < 0 or col > 7 or self.board[row][col] != 0:
            return False

        marks = {"agent_1": 1, "agent_2": -1, "empty": 0}
        agent_id = marks[self.__policy_id]
        directions = [
            (1, 0),
            (-1, 0),
            (0, 1),
            (0, -1),
            (1, 1),
            (1, -1),
            (-1, 1),
            (-1, -1),
        ]

        for d_row, d_col in directions:
            r, c = row + d_row, col + d_col
            if 0 <= r < 8 and 0 <= c < 8 and self.board[r][c] == -agent_id:
                r += d_row
                c += d_col
                while 0 <= r < 8 and 0 <= c < 8 and self.board[r][c] == -agent_id:
                    r += d_row
                    c += d_col
                if 0 <= r < 8 and 0 <= c < 8 and self.board[r][c] == agent_id:
                    return True

        return False

In [6]:
import os

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    agent_id = int(agent_id[-1]) - 1
    return "agent_1" if episode.episode_id % 2 == agent_id else "agent_2"

config = ppo.PPOConfig().environment("othello").framework("torch").rollouts(num_rollout_workers=os.cpu_count() // 2).resources(num_gpus=0)
config = config.multi_agent(policies={"agent_1": PolicySpec(), "agent_2": PolicySpec(policy_class=OthelloRandomPolicy)}, policy_mapping_fn=policy_mapping_fn, policies_to_train=["agent_1"])
config = config.training(model={"fcnet_hiddens": [512, 512]}, _enable_learner_api=False)
# config = config.training(num_sgd_iter=10, model={"conv_filters": [[32, [3, 3], 1], [64, [3, 3], 1]]})
# config = config.training(model={"conv_filters": [[32, [3, 3], 2], [64, [3, 3], 2]]}, _enable_learner_api=False)
# config = config.training(_enable_learner_api=False)
config = config.rl_module(_enable_rl_module_api=False)



In [7]:
# import random
# # multiagent environment for othello
# env = OthelloEnv({})
# obs, _ = env.reset()
# terminated = False
# current_player = "agent_1"
# while not terminated:
#     env.render()
#     # action = env.action_space.sample()
#     valid_actions = env.get_valid_moves(current_player)
#     print(current_player, obs.keys())
#     print(obs[current_player].shape)
#     action = algo.compute_single_action(obs[current_player], policy_id=current_player)
#     print(action)
#     if len(valid_actions) > 0:
#         action = random.choice(valid_actions)
#     else:
#         action = 64
#     action = {current_player: action}
#     obs, reward, terminated, truncated, _= env.step(action)
#     reward = reward[current_player]
#     terminated = terminated[current_player]
#     truncated = truncated[current_player]
#     current_player = env.current_player
#     print(terminated, truncated, reward)

In [8]:
from ray import air
from ray import tune

results = tune.Tuner(
    "PPO",
    param_space=config,
    run_config=air.RunConfig(
        checkpoint_config=air.CheckpointConfig(
            checkpoint_at_end=True,
            checkpoint_frequency=50,
        )
    ),
).fit()

0,1
Current time:,2023-08-14 19:26:07
Running for:,00:01:41.60
Memory:,7.4/7.9 GiB

Trial name,status,loc
PPO_othello_be570_00000,PENDING,




In [None]:
results

ResultGrid<[
  Result(
    error='TuneError',
    metrics={'trial_id': 'fef35_00000'},
    path='c://\\Users\\lcglab\\ray_results\\IMPALA\\IMPALA_othello_fef35_00000_0_2023-08-11_00-17-18',
    checkpoint=None
  )
]>

[2m[36m(Impala pid=21228)[0m 2023-08-11 00:17:38,050	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=16064, ip=127.0.0.1, actor_id=77c468a4531990bdcb4240c601000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x0000022313862700>)
[2m[36m(Impala pid=21228)[0m   File "python\ray\_raylet.pyx", line 1424, in ray._raylet.execute_task
[2m[36m(Impala pid=21228)[0m   File "python\ray\_raylet.pyx", line 1364, in ray._raylet.execute_task.function_executor
[2m[36m(Impala pid=21228)[0m   File "c:\Users\lcglab\miniconda3\envs\rl\lib\site-packages\ray\_private\function_manager.py", line 726, in actor_method_executor
[2m[36m(Impala pid=21228)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(Impala pid=21228)[0m   File "c:\Users\lcglab\miniconda3\envs\rl\lib\site-packages\ray\util\tracing\tracing_helper.py", line 464, in 

In [None]:
checkpoint = results.get_best_result().checkpoint
print(checkpoint)

None


In [None]:
import ray
ray.shutdown()
register_env("othello", lambda _: OthelloEnv({}))

In [None]:
config.expolore = False
algo = config.build()
algo.restore(checkpoint)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-08-11 00:17:46,143	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-11 00:17:59,702	INFO trainable.py:172 -- Trainable.setup took 18.396 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [None]:
algo.get_policy("agent_1").export_model("othello_policy", onnx=18)

verbose: False, log level: Level.ERROR



In [None]:
import random
# multiagent environment for othello
env = OthelloEnv({})
obs, _ = env.reset()
terminated = False
current_player = "agent_1"
while not terminated:
    env.render()
    # action = env.action_space.sample()
    valid_actions = env.get_valid_moves(current_player)
    action = algo.compute_single_action(obs[current_player], policy_id=current_player)
    # if len(valid_actions) > 0:
    #     action = random.choice(valid_actions)
    # else:
    #     action = 64
    action = {current_player: action}
    print(action)
    obs, reward, terminated, truncated, _= env.step(action)
    reward = reward[current_player]
    terminated = terminated[current_player]
    truncated = truncated[current_player]
    current_player = env.current_player

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|X|.|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

{'agent_1': 29}
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

{'agent_2': 37}
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|X|X|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

{'agent_1': 19}


In [None]:
env.render()

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|X|X|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

