In [9]:
# Import necessary modules
from pettingzoo.utils.env import ParallelEnv
from gymnasium import spaces
from ray.rllib.env import ParallelPettingZooEnv
import numpy as np
from ray.rllib.algorithms.ppo import PPOConfig
import ray
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray import tune
from ray.tune.registry import register_env

In [10]:
class PrisonersDilemmaParallel(ParallelEnv):
    def __init__(self):
        self.possible_agents = ["agent_0", "agent_1"]
        self.action_spaces = {agent: spaces.Discrete(2) for agent in self.possible_agents}
        self.observation_spaces = {
            agent: spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8) for agent in self.possible_agents
        }
        self.reset()

    def reset(self):
        self.agents = self.possible_agents[:]
        self.dones = {agent: False for agent in self.possible_agents}
        self.rewards = {agent: 0 for agent in self.possible_agents}
        self.cumulative_reward = {agent : 0 for agent in self.possible_agents}
        self.observations = {agent: np.array([0, 0]) for agent in self.possible_agents}
        self.infos = {agent: {} for agent in self.possible_agents}
        return self.observations

    def step(self, actions):
        if not all(agent in actions for agent in self.possible_agents):
            raise ValueError("All agents must have an action")

        action_0, action_1 = actions["agent_0"], actions["agent_1"]

        if action_0 == 0 and action_1 == 0:
            rewards = [1, 1]
        elif action_0 == 1 and action_1 == 0:
            rewards = [10, 0]
        elif action_0 == 0 and action_1 == 1:
            rewards = [0, 10]
        else:
            rewards = [0, 0]

        self.rewards = {"agent_0": rewards[0], "agent_1": rewards[1]}
        for agent, cumul_reward in self.rewards.items():
            self.cumulative_reward[agent] += self.rewards[agent]
        self.dones = {"agent_0": False, "agent_1": False}
        self.observations = {
            "agent_0": np.array([actions["agent_0"], actions["agent_1"]]),
            "agent_1": np.array([actions["agent_1"], actions["agent_0"]]),
        }
        return self.observations, self.rewards, self.dones, self.infos

    def render(self):
        print(f"Agent 0: {self.observations['agent_0']}, Turn Reward: {self.rewards['agent_0']}, Cumul Reward : {self.cumulative_reward['agent_0']}")
        print(f"Agent 1: {self.observations['agent_1']}, Turn Reward: {self.rewards['agent_1']}, Cumul Reward : {self.cumulative_reward['agent_1']}")

    def close(self):
        pass

    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def action_space(self, agent):
        return self.action_spaces[agent]

In [11]:
def launch_qq_tours(nb_tours: int = 5):
    # Initialize the environment
    env = PrisonersDilemmaParallel()

    # Run 5 turns of the game
    env.reset()
    for turn in range(5):
        actions = {
            "agent_0": np.random.choice([0, 1]),  # Random action for agent 0
            "agent_1": np.random.choice([0, 1])   # Random action for agent 1
        }
        observations, rewards, dones, infos = env.step(actions)
        print(f"Turn {turn + 1}:")
        env.render()

In [12]:
# Classe encapsulant l'environnement parallèle dans un environnement multi-agent
class RLlibPrisonersDilemma(MultiAgentEnv):
    def __init__(self):
        self.env = PrisonersDilemmaParallel()
        self.agents = self.env.possible_agents

    def reset(self):
        observations = self.env.reset()
        return {agent: observations[agent] for agent in self.agents}

    def step(self, action_dict):
        observations, rewards, dones, infos = self.env.step(action_dict)
        return (
            {agent: observations[agent] for agent in self.agents},
            {agent: rewards[agent] for agent in self.agents},
            {agent: dones[agent] for agent in self.agents},
            {agent: infos[agent] for agent in self.agents},
        )

    def render(self):
        self.env.render()

    def observation_space(self, agent):
        return self.env.observation_space(agent)

    def action_space(self, agent):
        return self.env.action_space(agent)

In [13]:
# Création de l'environnement
def env_creator(_):
    return RLlibPrisonersDilemma()

# Enregistrement de l'environnement
register_env("prisoners_dilemma_parallel", env_creator)

# Configuration de l'entraînement
config = {
    "env": "prisoners_dilemma_parallel",
    "framework": "torch",  # or "tf"
    "num_gpus": 0,
    "num_workers": 1,
    "multiagent": {
        "policies": {
            "policy_0": (None, spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8), spaces.Discrete(2), {}),
            "policy_1": (None, spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8), spaces.Discrete(2), {}),
        },
        "policy_mapping_fn": lambda agent_id: "policy_0" if agent_id == "agent_0" else "policy_1",
    },
}

# Initialisation de Ray
ray.init(ignore_reinit_error=True)

# Lancement de l'entraînement
tune.run(
    "PPO",
    stop={"episodes_total": 5000},
    config=config
)

# Arrêt de Ray
ray.shutdown()

2024-06-06 11:25:01,222	INFO worker.py:1582 -- Calling ray.init() again after it has already been called.
2024-06-06 11:25:01,223	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-06-06 11:25:08
Running for:,00:00:07.49
Memory:,17.6/31.7 GiB

Trial name,# failures,error file
PPO_prisoners_dilemma_parallel_a65be_00000,1,C:/Users/P23B6~1.ARC/AppData/Local/Temp/ray/session_2024-06-06_11-20-03_814344_528/artifacts/2024-06-06_11-25-01/PPO_2024-06-06_11-25-01/driver_artifacts/PPO_prisoners_dilemma_parallel_a65be_00000_0_2024-06-06_11-25-01/error.txt

Trial name,status,loc
PPO_prisoners_dilemma_parallel_a65be_00000,ERROR,127.0.0.1:22864


2024-06-06 11:25:08,703	ERROR tune_controller.py:1331 -- Trial task failed for trial PPO_prisoners_dilemma_parallel_a65be_00000
Traceback (most recent call last):
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
           

Trial name
PPO_prisoners_dilemma_parallel_a65be_00000


2024-06-06 11:25:08,714	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to 'C:/Users/p.archipczuk/ray_results/PPO_2024-06-06_11-25-01' in 0.0059s.


TuneError: ('Trials did not complete', [PPO_prisoners_dilemma_parallel_a65be_00000])

In [15]:
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from gymnasium import spaces
import numpy as np
from pettingzoo.utils.env import ParallelEnv

# Environnement du Dilemme du Prisonnier
class PrisonersDilemmaParallel(ParallelEnv):
    def __init__(self):
        self.possible_agents = ["agent_0", "agent_1"]
        self.action_spaces = {agent: spaces.Discrete(2) for agent in self.possible_agents}
        self.observation_spaces = {
            agent: spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8) for agent in self.possible_agents
        }
        self.reset()

    def reset(self, seed=None, options=None):
        self.agents = self.possible_agents[:]
        self.dones = {agent: False for agent in self.possible_agents}
        self.rewards = {agent: 0 for agent in self.possible_agents}
        self.observations = {agent: np.array([0, 0]) for agent in self.possible_agents}
        self.infos = {agent: {} for agent in self.possible_agents}
        return self.observations

    def step(self, actions):
        if not all(agent in actions for agent in self.possible_agents):
            raise ValueError("All agents must have an action")

        action_0, action_1 = actions["agent_0"], actions["agent_1"]

        if action_0 == 0 and action_1 == 0:
            rewards = [100, 100]
        elif action_0 == 1 and action_1 == 0:
            rewards = [200, 0]
        elif action_0 == 0 and action_1 == 1:
            rewards = [0, 200]
        else:
            rewards = [0, 0]

        self.rewards = {"agent_0": rewards[0], "agent_1": rewards[1]}
        self.dones = {"agent_0": False, "agent_1": False}
        self.observations = {
            "agent_0": np.array([actions["agent_0"], actions["agent_1"]]),
            "agent_1": np.array([actions["agent_1"], actions["agent_0"]]),
        }
        return self.observations, self.rewards, self.dones, self.infos

    def render(self):
        print(f"Agent 0: {self.observations['agent_0']}, Reward: {self.rewards['agent_0']}")
        print(f"Agent 1: {self.observations['agent_1']}, Reward: {self.rewards['agent_1']}")

    def close(self):
        pass

    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def action_space(self, agent):
        return self.action_spaces[agent]

# Classe encapsulant l'environnement parallèle dans un environnement multi-agent
class RLlibPrisonersDilemma(MultiAgentEnv):
    def __init__(self):
        self.env = PrisonersDilemmaParallel()
        self.agents = self.env.possible_agents

    def reset(self, seed=None, options=None):
        observations = self.env.reset(seed=seed)
        return {agent: observations[agent] for agent in self.agents}

    def step(self, action_dict):
        observations, rewards, dones, infos = self.env.step(action_dict)
        return (
            {agent: observations[agent] for agent in self.agents},
            {agent: rewards[agent] for agent in self.agents},
            {agent: dones[agent] for agent in self.agents},
            {agent: infos[agent] for agent in self.agents},
        )

    def render(self):
        self.env.render()

    def observation_space(self, agent):
        return self.env.observation_space(agent)

    def action_space(self, agent):
        return self.env.action_space(agent)

# Création de l'environnement
def env_creator(_):
    return RLlibPrisonersDilemma()

# Enregistrement de l'environnement
register_env("prisoners_dilemma_parallel", env_creator)

# Configuration de l'entraînement
config = {
    "env": "prisoners_dilemma_parallel",
    "framework": "torch",  # or "tf"
    "num_gpus": 0,
    "num_workers": 1,
    "multiagent": {
        "policies": {
            "policy_0": (None, spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8), spaces.Discrete(2), {}),
            "policy_1": (None, spaces.Box(low=0, high=1, shape=(2,), dtype=np.int8), spaces.Discrete(2), {}),
        },
        "policy_mapping_fn": lambda agent_id: "policy_0" if agent_id == "agent_0" else "policy_1",
    },
}

# Initialisation de Ray
ray.init(ignore_reinit_error=True)

# Lancement de l'entraînement
tune.run(
    "PPO",
    stop={"episodes_total": 5000},
    config=config
)

# Arrêt de Ray
ray.shutdown()


2024-06-06 11:26:34,988	INFO worker.py:1582 -- Calling ray.init() again after it has already been called.
2024-06-06 11:26:34,989	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-06-06 11:26:42
Running for:,00:00:07.40
Memory:,17.7/31.7 GiB

Trial name,# failures,error file
PPO_prisoners_dilemma_parallel_de3f4_00000,1,C:/Users/P23B6~1.ARC/AppData/Local/Temp/ray/session_2024-06-06_11-20-03_814344_528/artifacts/2024-06-06_11-26-34/PPO_2024-06-06_11-26-34/driver_artifacts/PPO_prisoners_dilemma_parallel_de3f4_00000_0_2024-06-06_11-26-34/error.txt

Trial name,status,loc
PPO_prisoners_dilemma_parallel_de3f4_00000,ERROR,127.0.0.1:7960


2024-06-06 11:26:42,380	ERROR tune_controller.py:1331 -- Trial task failed for trial PPO_prisoners_dilemma_parallel_de3f4_00000
Traceback (most recent call last):
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\p.archipczuk\AppData\Local\anaconda3\envs\RL\Lib\site-packages\ray\_private\worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
           

Trial name
PPO_prisoners_dilemma_parallel_de3f4_00000


2024-06-06 11:26:42,389	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to 'C:/Users/p.archipczuk/ray_results/PPO_2024-06-06_11-26-34' in 0.0050s.


TuneError: ('Trials did not complete', [PPO_prisoners_dilemma_parallel_de3f4_00000])