In [1]:
import os
import gymnasium as gym
import numpy as np

import ray
from ray.tune import CLIReporter, register_env
from ray.rllib.algorithms import ppo, dqn, appo, impala
from ray.rllib.policy.policy import PolicySpec



In [2]:
from rl.envs.othello import OthelloEnv
from rl.agents.random_policy import OthelloRandomPolicy

In [3]:
# Start a local Ray instance without the web dashboard.
# ignore_reinit_error=True keeps this cell safe to re-run in a live kernel:
# without it, a second ray.init() call on an already-initialised Ray raises.
ray.init(include_dashboard=False, ignore_reinit_error=True)

2023-08-21 13:44:17,721	INFO worker.py:1621 -- Started a local Ray instance.


0,1
Python version:,3.8.17
Ray version:,2.6.2


In [4]:
def make_othello_env(_env_config):
    """Environment creator registered under the name "othello".

    The config argument handed to the creator is ignored; the environment
    is always constructed from an empty config dict.
    """
    return OthelloEnv({})


register_env("othello", make_othello_env)

In [5]:
# Model options handed to config.training(model=...) below: three fully
# connected hidden layers of 512 units each.
model_config = dict(fcnet_hiddens=[512, 512, 512])

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Pick the policy id that controls ``agent_id`` in this episode.

    The trailing character of ``agent_id`` is parsed as a 1-based seat
    number. The seat whose 0-based index equals ``episode.episode_id % 2``
    is mapped to ``"agent_1"``; every other seat is mapped to ``"agent_2"``,
    so the two policies swap seats between even and odd episode ids.
    ``worker`` and ``kwargs`` are accepted but not used.
    """
    seat_index = int(agent_id[-1]) - 1
    episode_parity = episode.episode_id % 2
    if seat_index == episode_parity:
        return "agent_1"
    return "agent_2"

# Use roughly 70% of the logical CPUs for rollout workers (rounded down, so
# a single-CPU machine gets 0 extra workers). os.cpu_count() may return None
# when the CPU count cannot be determined, so fall back to 1 instead of
# failing on `None * 0.7`.
num_rollout_workers = int((os.cpu_count() or 1) * 0.7)

# PPO on the registered "othello" env with the torch framework and one GPU.
# "agent_1" uses the algorithm's default policy class and is the only policy
# that is trained; "agent_2" is the OthelloRandomPolicy opponent.
# The learner / RLModule API flags are both switched off explicitly.
config = (
    ppo.PPOConfig()
    .environment("othello")
    .framework("torch")
    .rollouts(num_rollout_workers=num_rollout_workers)
    .resources(num_gpus=1)
    .multi_agent(
        policies={
            "agent_1": PolicySpec(),
            "agent_2": PolicySpec(policy_class=OthelloRandomPolicy),
        },
        policy_mapping_fn=policy_mapping_fn,
        policies_to_train=["agent_1"],
    )
    .training(model=model_config, _enable_learner_api=False)
    .rl_module(_enable_rl_module_api=False)
)

# Alternative setups kept for reference:
# Self-play variant in which both policies are trained:
# config = config.multi_agent(policies={"agent_1": PolicySpec(), "agent_2": PolicySpec()}, policy_mapping_fn=policy_mapping_fn, policies_to_train=["agent_1", "agent_2"])
# Convolutional model variants:
# config = config.training(num_sgd_iter=10, model={"conv_filters": [[32, [3, 3], 1], [64, [3, 3], 1]]})
# config = config.training(model={"conv_filters": [[32, [3, 3], 2], [64, [3, 3], 2]]}, _enable_learner_api=False)



In [6]:
# import random
# # multiagent environment for othello
# env = OthelloEnv({})
# obs, _ = env.reset()
# terminated = False
# current_player = "agent_1"
# while not terminated:
#     env.render()
#     # action = env.action_space.sample()
#     valid_actions = env.get_valid_moves(current_player)
#     print(current_player, obs.keys())
#     print(obs[current_player].shape)
#     action = algo.compute_single_action(obs[current_player], policy_id=current_player)
#     print(action)
#     if len(valid_actions) > 0:
#         action = random.choice(valid_actions)
#     else:
#         action = 64
#     action = {current_player: action}
#     obs, reward, terminated, truncated, _= env.step(action)
#     reward = reward[current_player]
#     terminated = terminated[current_player]
#     truncated = truncated[current_player]
#     current_player = env.current_player
#     print(terminated, truncated, reward)

In [7]:
from ray import air
from ray import tune

# Save a checkpoint every 50 training iterations and once more at the end.
checkpoint_config = air.CheckpointConfig(
    checkpoint_frequency=50,
    checkpoint_at_end=True,
)
run_config = air.RunConfig(
    checkpoint_config=checkpoint_config,
    # progress_reporter=CLIReporter(
    #     metric_columns=["episodes_total", "episode_len_mean", "policy_agent_1_reward", "policy_agent_2_reward", "episode_reward_mean"]
    # )
)

# NOTE(review): no stop criterion is passed here, so training presumably
# keeps running until it is interrupted — confirm and consider adding one.
tuner = tune.Tuner("PPO", param_space=config, run_config=run_config)
results = tuner.fit()

0,1
Current time:,2023-08-21 17:04:38
Running for:,03:20:18.27
Memory:,8.3/15.9 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_othello_64531_00000,RUNNING,127.0.0.1:8152,1939,11799.2,7756000,-16.7815,36,-25,6.96187


[2m[36m(PPO pid=8152)[0m Install gputil for GPU system monitoring.
[2m[36m(PPO pid=8152)[0m Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/nYoshiaki/ray_results/PPO/PPO_othello_64531_00000_0_2023-08-21_13-44-19/checkpoint_000050/.is_checkpoint' to 'c:///Users/nYoshiaki/ray_results/PPO/PPO_othello_64531_00000_0_2023-08-21_13-44-19/checkpoint_000050/.is_checkpoint'. Detail: [Windows error 32] �v���Z�X�̓t�@�C���ɃA�N�Z�X�ł��܂���B�ʂ̃v���Z�X���g�p���ł��B
[2m[36m(PPO pid=8152)[0m . Retrying after sleeping for 1.0 seconds...
[2m[36m(PPO pid=8152)[0m Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/nYoshiaki/ray_results/PPO/PPO_othello_64531_00000_0_2023-08-21_13-44-19/checkpoint_000050/.is_checkpoint' to '/Users/nYoshiaki/ray_results/PPO/PPO_othello_64531_00000_0_2023-08-21_13-44-19/checkpoint_000050/.is_checkpoint'. Detail: [Windows error 32] �v���Z�X�̓t�@�C���ɃA�N�Z�X�ł��܂���B�ʂ̃v���Z�X���g�p���ł��B
[2m[36m(PPO pid=8

In [8]:
# Display the ResultGrid returned by Tuner.fit().
results

ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'agent_1': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 2.7304934122910103, 'cur_kl_coeff': 0.42714843749999987, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 5.640662694970767, 'policy_loss': -0.05690806179967088, 'vf_loss': 5.693944196899732, 'vf_explained_var': 0.06784498443206151, 'kl': 0.00849014313190916, 'entropy': 0.7519325812657675, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 125.25, 'num_grad_updates_lifetime': 930480.5, 'diff_num_grad_updates_vs_sampler_policy': 239.5}}, 'num_env_steps_sampled': 7756000, 'num_env_steps_trained': 7756000, 'num_agent_steps_sampled': 7755990, 'num_agent_steps_trained': 7755990}, 'sampler_results': {'episode_reward_max': 36.0, 'episode_reward_min': -25.0, 'episode_reward_mean': -16.78145580589255, 'episode_len_mean': 6.961871750433276, 'episode_media': {}, 'episodes_this_iter': 577, 'policy_

In [9]:
# Take the checkpoint attached to the best result of the run and print
# where it is stored.
best_result = results.get_best_result()
checkpoint = best_result.checkpoint
print(checkpoint)

Checkpoint(uri=c://\Users\nYoshiaki\ray_results\PPO\PPO_othello_64531_00000_0_2023-08-21_13-44-19\checkpoint_001900)


In [None]:
import ray
# Shut down the Ray instance that was used for training.
ray.shutdown()
# Register the "othello" environment creator again.
# NOTE(review): presumably required because the earlier registration does not
# survive ray.shutdown() — confirm.
register_env("othello", lambda _: OthelloEnv({}))

In [10]:
# Switch exploration off so the restored policy does not sample exploratory
# actions during evaluation. The previous `config.expolore = False` was a
# typo: it only created an unused attribute and left exploration enabled.
config = config.exploration(explore=False)

# Build a fresh algorithm from the config and load the trained state from
# the checkpoint chosen earlier.
algo = config.build()
algo.restore(checkpoint)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-08-21 17:05:00,902	INFO trainable.py:172 -- Trainable.setup took 12.400 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-08-21 17:05:00,913	INFO checkpoint.py:580 -- Downloading checkpoint from c://\Users\nYoshiaki\ray_re

In [11]:
# Export the model of the "agent_1" policy to models/othello_policy.
# Passing `onnx=18` requests an ONNX export; RLlib documents the value as the
# ONNX opset version to use.
algo.get_policy("agent_1").export_model("models/othello_policy", onnx=18)

verbose: False, log level: Level.ERROR



In [None]:
import random
# Play one episode in the multi-agent Othello environment. On every turn the
# restored algorithm picks the move, using the policy whose id matches the
# player to move.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: object of type 'int' has no len()" after the first move; the
# cause is not visible from this notebook — investigate before relying on it.
env = OthelloEnv({})
obs, _ = env.reset()
terminated = False
truncated = False
current_player = "agent_1"
# Stop on truncation as well as termination, so a truncated episode cannot
# keep the loop stepping an environment that has already finished.
while not (terminated or truncated):
    env.render()
    action = {
        current_player: algo.compute_single_action(
            obs[current_player], policy_id=current_player
        )
    }
    print(action)
    # Keep the per-agent dicts under their own names instead of rebinding
    # the scalar flags to dicts and back.
    obs, rewards, terminateds, truncateds, _ = env.step(action)
    # Read the entries of the player who just moved.
    reward = rewards[current_player]
    terminated = terminateds[current_player]
    truncated = truncateds[current_player]
    current_player = env.current_player

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|X|.|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

{'agent_1': 29}
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|O|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.



TypeError: object of type 'int' has no len()

In [None]:
# Render the board in the environment's current state.
env.render()

.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|O|O|O|.|.
.|.|.|X|X|X|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.
.|.|.|.|.|.|.|.

