In [None]:
from pettingzoo.sisl import multiwalker_v9

env = multiwalker_v9.env(n_walkers=5, position_noise=1e-3, angle_noise=1e-3, forward_reward=1.0, terminate_reward=-100.0, fall_reward=-10.0, shared_reward=True,
terminate_on_fall=True, remove_on_fall=True, terrain_length=200, max_cycles=500, render_mode='human')
env.reset(seed=42)

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()

    if termination or truncation:
        action = None
    else:
        # this is where you would insert your policy
        action = env.action_space(agent).sample()

    env.step(action)
env.close()

In [None]:
#parallel 

from pettingzoo.sisl import multiwalker_v9

env = multiwalker_v9.parallel_env(render_mode="human")
observations, infos = env.reset()

while env.agents:
    # this is where you would insert your policy
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}

    observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()

# MADDPG


In [None]:
import os

import numpy as np
import supersuit as ss
import torch
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from tqdm import trange

from pettingzoo.sisl import multiwalker_v9
MultiAgentReplayBuffer
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Define the network configuration
    NET_CONFIG = {
        "arch": "mlp",  # Network architecture
        "h_size": [32, 32],  # Network hidden size
    }

    # Define the initial hyperparameters
    INIT_HP = {
        "POPULATION_SIZE": 3,
        "ALGO": "MADDPG",  # Algorithm
        "CHANNELS_LAST": False,
        "BATCH_SIZE": 32,  # Batch size
        "LR": 0.01,  # Learning rate
        "GAMMA": 0.99,  # Discount factor
        "MEMORY_SIZE": 1000,  # Max memory buffer size
        "LEARN_STEP": 3,  # Learning frequency
        "TAU": 0.01,  # For soft update of target parameters
    }

    # Define the multiwalker environment as a parallel environment
    env = multiwalker_v9.parallel_env()
    env.reset()

    # Configure the multi-agent algo input arguments
    try:
        state_dim = [env.observation_space(agent).shape for agent in env.agents]
        one_hot = False
    except Exception:
        state_dim = [env.observation_space(agent).n for agent in env.agents]
        one_hot = True

    try:
        action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = False
        INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
        INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
    except Exception:
        action_dim = [env.action_space(agent).n for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = True
        INIT_HP["MAX_ACTION"] = None
        INIT_HP["MIN_ACTION"] = None

    # Append number of agents and agent IDs to the initial hyperparameter dictionary
    INIT_HP["N_AGENTS"] = env.num_agents
    INIT_HP["AGENT_IDS"] = env.agents

    # Create a population ready for evolutionary hyper-parameter optimisation
    pop = initialPopulation(
        INIT_HP["ALGO"],
        state_dim,
        action_dim,
        one_hot,
        NET_CONFIG,
        INIT_HP,
        population_size=INIT_HP["POPULATION_SIZE"],
        device=device,
    )

    # Configure the multi-agent replay buffer
    field_names = ["state", "action", "reward", "next_state", "done"]
    memory = MultiAgentReplayBuffer(
        INIT_HP["MEMORY_SIZE"],
        field_names=field_names,
        agent_ids=INIT_HP["AGENT_IDS"],
        device=device,
    )

    # Instantiate a tournament selection object (used for HPO)
    tournament = TournamentSelection(
        tournament_size=3,  # Tournament selection size
        elitism=True,  # Elitism in tournament selection
        population_size=INIT_HP["POPULATION_SIZE"],  # Population size
        evo_step=1,
    )  # Evaluate using last N fitness scores

    # Instantiate a mutations object (used for HPO)
    mutations = Mutations(
        algo=INIT_HP["ALGO"],
        no_mutation=0.2,  # Probability of no mutation
        architecture=0.2,  # Probability of architecture mutation
        new_layer_prob=0.2,  # Probability of new layer mutation
        parameters=0.2,  # Probability of parameter mutation
        activation=0,  # Probability of activation function mutation
        rl_hp=0.2,  # Probability of RL hyperparameter mutation
        rl_hp_selection=[
            "lr",
            "learn_step",
            "batch_size",
        ],  # RL hyperparams selected for mutation
        mutation_sd=0.1,  # Mutation strength
        # Define search space for each hyperparameter
        min_lr=0.0001,
        max_lr=0.01,
        min_learn_step=1,
        max_learn_step=120,
        min_batch_size=8,
        max_batch_size=64,
        agent_ids=INIT_HP["AGENT_IDS"],  # Agent IDs
        arch=NET_CONFIG["arch"],  # MLP or CNN
        rand_seed=1,
        device=device,
    )

    # Define training loop parameters
    max_episodes = 1000  # Total episodes (default: 6000)
    max_steps = 500  # Maximum steps to take in each episode
    epsilon = 1.0  # Starting epsilon value
    eps_end = 0.1  # Final epsilon value
    eps_decay = 0.995  # Epsilon decay
    evo_epochs = 50  # Evolution frequency
    evo_loop = 1  # Number of evaluation episodes
    elite = pop[0]  # Assign a placeholder "elite" agent

    # Training loop
        # Training loop
    for idx_epi in trange(max_episodes):
        state = env.reset()  # Reset environment at start of episode
        state = state[0]
        agent_reward = {agent_id: 0 for agent_id in env.agents}

        for _ in range(max_steps):
            actions = {}
            state_dict = {agent_id: state[agent_id] for agent_id in env.agents}
            for agent_idx, agent in enumerate(pop) :
                cont_actions, discrete_action = agent.getAction(state_dict, epsilon, agent_mask={env.agents[agent_idx]: True})
                action = discrete_action if agent.discrete_actions else cont_actions
                actions.update(action)

            next_state, reward, termination, truncation, info = env.step(actions)  # Act in environment

            # Save experiences to replay buffer
            memory.save2memory(state_dict, actions, reward, next_state, termination)

            # Collect the reward
            for agent_id, r in reward.items():
                agent_reward[agent_id] += r

            batch_size = pop[0].batch_size
            learn_step = pop[0].learn_step
            # Learn according to learning frequency
            if (memory.counter % learn_step == 0) and (len(memory) >= batch_size):
                experiences = memory.sample(batch_size)  # Sample replay buffer
                for agent_idx, agent in enumerate(pop) :
                    agent.learn(experiences)  # Learn according to agent's RL algorithm

            state = next_state  # Update the state

            # Stop episode if any agents have terminated
            if any(termination.values()) or any(truncation.values()):
                break

        # Save the total episode reward
        score = sum(agent_reward.values())
        for agent in pop:
            agent.scores.append(score)

        # Update epsilon for exploration
        epsilon = max(eps_end, epsilon * eps_decay)

        # Now evolve population if necessary
        if (idx_epi + 1) % evo_epochs == 0:
            # Evaluate population
            fitnesses = [
                agent.test(
                    env,
                    max_steps=max_steps,
                    loop=evo_loop,
                )
                for agent in pop
            ]

            print(f"Episode {idx_epi + 1}/{max_episodes}")
            print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
            print(
                f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
            )

            # Tournament selection and population mutation
            elite, pop = tournament.select(pop)
            pop = mutations.mutation(pop)

    # Save the trained algorithm
    path = "./models/MADDPG"
    filename = "MADDPG_trained_agent.pt"
    os.makedirs(path, exist_ok=True)
    save_path = os.path.join(path, filename)
    elite.saveCheckpoint(save_path)

  5%|▌         | 50/1000 [00:37<13:41,  1.16it/s]

Episode 50/1000
Fitnesses: ['-185.65', '-233.38', '-210.20']
100 fitness avgs: ['-185.65', '-233.38', '-210.20']


 10%|█         | 100/1000 [01:14<11:41,  1.28it/s]

Episode 100/1000
Fitnesses: ['-230.16', '-120.35', '-192.38']
100 fitness avgs: ['-207.91', '-176.86', '-189.01']


 15%|█▌        | 150/1000 [01:50<11:44,  1.21it/s]

Episode 150/1000
Fitnesses: ['-305.65', '-231.07', '-215.51']
100 fitness avgs: ['-219.79', '-203.03', '-189.74']


 20%|██        | 200/1000 [02:29<11:34,  1.15it/s]

Episode 200/1000
Fitnesses: ['-305.69', '-221.01', '-318.47']
100 fitness avgs: ['-218.73', '-197.56', '-244.46']


 25%|██▌       | 250/1000 [03:08<09:12,  1.36it/s]

Episode 250/1000
Fitnesses: ['-314.04', '-327.99', '-82.00']
100 fitness avgs: ['-220.86', '-240.58', '-191.38']


 30%|███       | 300/1000 [04:06<13:30,  1.16s/it]

Episode 300/1000
Fitnesses: ['-298.44', '-173.05', '-270.87']
100 fitness avgs: ['-209.23', '-188.33', '-204.63']


 35%|███▌      | 350/1000 [04:56<15:46,  1.46s/it]

Episode 350/1000
Fitnesses: ['-250.17', '-225.29', '-293.78']
100 fitness avgs: ['-197.16', '-207.58', '-217.37']


 40%|████      | 400/1000 [05:56<10:35,  1.06s/it]

Episode 400/1000
Fitnesses: ['-280.94', '-210.81', '-104.99']
100 fitness avgs: ['-216.75', '-207.99', '-194.76']


 45%|████▌     | 450/1000 [07:15<11:35,  1.27s/it]

Episode 450/1000
Fitnesses: ['-273.44', '-105.15', '-296.40']
100 fitness avgs: ['-203.50', '-184.80', '-206.05']


 50%|█████     | 500/1000 [08:33<10:06,  1.21s/it]

Episode 500/1000
Fitnesses: ['-215.88', '-205.97', '-143.21']
100 fitness avgs: ['-187.91', '-203.75', '-197.47']


 55%|█████▌    | 550/1000 [09:33<07:30,  1.00s/it]

Episode 550/1000
Fitnesses: ['-209.90', '-178.64', '-235.36']
100 fitness avgs: ['-198.60', '-195.76', '-200.92']


 60%|█████▉    | 599/1000 [10:55<08:51,  1.33s/it]

Episode 600/1000
Fitnesses: ['-227.91', '-280.49', '-237.36']
100 fitness avgs: ['-198.44', '-202.82', '-199.23']


 65%|██████▌   | 650/1000 [11:56<08:55,  1.53s/it]

Episode 650/1000
Fitnesses: ['-251.90', '-158.20', '-263.39']
100 fitness avgs: ['-202.55', '-196.07', '-203.44']


 70%|███████   | 700/1000 [13:07<18:00,  3.60s/it]

Episode 700/1000
Fitnesses: ['-262.34', '-325.21', '-208.99']
100 fitness avgs: ['-200.80', '-211.31', '-196.99']


 75%|███████▌  | 750/1000 [14:17<05:11,  1.25s/it]

Episode 750/1000
Fitnesses: ['-56.84', '-226.20', '-141.69']
100 fitness avgs: ['-187.65', '-202.50', '-193.31']


 80%|███████▉  | 799/1000 [15:30<03:18,  1.01it/s]

Episode 800/1000
Fitnesses: ['-345.08', '-269.09', '-346.74']
100 fitness avgs: ['-197.49', '-192.74', '-197.59']


 85%|████████▍ | 849/1000 [16:29<05:04,  2.02s/it]

Episode 850/1000
Fitnesses: ['-243.47', '-222.57', '-223.16']
100 fitness avgs: ['-195.72', '-194.50', '-194.53']


 90%|█████████ | 900/1000 [17:42<01:39,  1.00it/s]

Episode 900/1000
Fitnesses: ['-311.42', '-247.22', '-196.90']
100 fitness avgs: ['-200.99', '-197.42', '-194.63']


 95%|█████████▍| 949/1000 [18:36<02:29,  2.93s/it]

Episode 950/1000
Fitnesses: ['-193.16', '-248.68', '-178.70']
100 fitness avgs: ['-194.55', '-203.50', '-193.79']


100%|█████████▉| 999/1000 [19:35<00:00,  1.16it/s]

Episode 1000/1000
Fitnesses: ['-299.17', '-167.78', '-213.98']
100 fitness avgs: ['-199.06', '-193.21', '-195.52']


100%|██████████| 1000/1000 [19:37<00:00,  1.18s/it]


In [5]:
import os
import imageio
import numpy as np
import torch
from agilerl.algorithms.maddpg import MADDPG
from PIL import Image, ImageDraw
from pettingzoo.sisl import multiwalker_v9  # Import the correct environment


# Define function to label the frame with episode number
def _label_with_episode_number(frame, episode_num):
    im = Image.fromarray(frame)
    drawer = ImageDraw.Draw(im)
    text_color = (255, 255, 255) if np.mean(frame) < 128 else (0, 0, 0)
    drawer.text((im.size[0] / 20, im.size[1] / 18), f"Episode: {episode_num+1}", fill=text_color)
    return im


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Configure the environment with render_mode "rgb_array"
    env = multiwalker_v9.parallel_env(render_mode="rgb_array")
    env.reset()

    # Define the observation and action dimensions based on Multiwalker environment
    state_dim = [env.observation_space(agent).shape for agent in env.agents]
    one_hot = False
    action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
    discrete_actions = False
    max_action = [env.action_space(agent).high for agent in env.agents]
    min_action = [env.action_space(agent).low for agent in env.agents]

    n_agents = env.num_agents
    agent_ids = env.agents

    # Instantiate an MADDPG object
    maddpg = MADDPG(
        state_dim,
        action_dim,
        one_hot,
        n_agents,
        agent_ids,
        max_action,
        min_action,
        discrete_actions,
        device=device,
    )

    # Load the trained MADDPG model
    path = "./models/MADDPG/MADDPG_trained_agent.pt"
    maddpg.loadCheckpoint(path)

    # Set testing parameters
    episodes = 10
    max_steps = 500

    rewards = []
    frames = []
    indi_agent_rewards = {agent_id: [] for agent_id in agent_ids}

    # Testing loop
    for ep in range(episodes):
        state, info = env.reset()
        agent_reward = {agent_id: 0 for agent_id in agent_ids}
        score = 0

        for _ in range(max_steps):
            # Get next action from agent
            cont_actions, _ = maddpg.getAction(state, epsilon=0)
            action = cont_actions  # Only continuous actions are used

            # Render and save the frame for video creation
            frame = env.render()  # Now renders an RGB array
            if frame is not None:
                frames.append(_label_with_episode_number(frame, episode_num=ep))

            # Step in the environment
            state, reward, termination, truncation, info = env.step(action)

            # Collect rewards for each agent
            for agent_id, r in reward.items():
                agent_reward[agent_id] += r

            # Sum the total episode reward
            score = sum(agent_reward.values())

            # End episode if any agents have terminated
            if any(truncation.values()) or any(termination.values()):
                break

        rewards.append(score)

        # Store individual agent rewards
        for agent_id in agent_ids:
            indi_agent_rewards[agent_id].append(agent_reward[agent_id])

        print("-" * 15, f"Episode: {ep}", "-" * 15)
        print("Episodic Reward: ", rewards[-1])
        for agent_id, reward_list in indi_agent_rewards.items():
            print(f"{agent_id} reward: {reward_list[-1]}")

    env.close()

    # Save frames as an MP4 video
    video_path = "./videos/"
    os.makedirs(video_path, exist_ok=True)
    with imageio.get_writer(os.path.join(video_path, "multiwalker.mp4"), fps=30) as writer:
        for frame in frames:
            writer.append_data(np.array(frame))


--------------- Episode: 0 ---------------
Episodic Reward:  -308.13980266451836
walker_0 reward: -102.71326755483945
walker_1 reward: -102.71326755483945
walker_2 reward: -102.71326755483945
--------------- Episode: 1 ---------------
Episodic Reward:  -309.0874107182026
walker_0 reward: -103.02913690606754
walker_1 reward: -103.02913690606754
walker_2 reward: -103.02913690606754
--------------- Episode: 2 ---------------
Episodic Reward:  -307.6405560821295
walker_0 reward: -102.5468520273765
walker_1 reward: -102.5468520273765
walker_2 reward: -102.5468520273765
--------------- Episode: 3 ---------------
Episodic Reward:  -321.2203716933727
walker_0 reward: -107.07345723112425
walker_1 reward: -107.07345723112425
walker_2 reward: -107.07345723112425
--------------- Episode: 4 ---------------
Episodic Reward:  -310.38620471954346
walker_0 reward: -103.46206823984781
walker_1 reward: -103.46206823984781
walker_2 reward: -103.46206823984781
--------------- Episode: 5 ---------------
Epi



--------------- Episode: 9 ---------------
Episodic Reward:  -308.84262354299426
walker_0 reward: -102.94754118099809
walker_1 reward: -102.94754118099809
walker_2 reward: -102.94754118099809


