In [None]:
import gymnasium as gym
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.algorithms.td3 import TD3
from pettingzoo.sisl import multiwalker_v9
import torch
from agilerl.utils.utils import initialPopulation


# Create environment and Experience Replay Buffer
num_envs = 1
env = multiwalker_v9.parallel_env(render_mode="human")

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Define the network configuration
    NET_CONFIG = {
        "arch": "mlp",  # Network architecture
        "h_size": [32, 32],  # Network hidden size
    }

    INIT_HP = {
        "POPULATION_SIZE": 3,
        "ALGO": "TD3",  # Algorithm
        "CHANNELS_LAST": False,
        "BATCH_SIZE": 32,  # Batch size
        "LR": 0.01,  # Learning rate
        "GAMMA": 0.99,  # Discount factor
        "MEMORY_SIZE": 1000,  # Max memory buffer size
        "LEARN_STEP": 3,  # Learning frequency
        "TAU": 0.01,  # For soft update of target parameters
    }
    
    # Configure the multi-agent algo input arguments
    try:
        state_dim = [env.observation_space(agent).shape for agent in env.agents]
        one_hot = False
    except Exception:
        state_dim = [env.observation_space(agent).n for agent in env.agents]
        one_hot = True

    try:
        action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = False
        INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
        INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
    except Exception:
        action_dim = [env.action_space(agent).n for agent in env.agents]
        INIT_HP["DISCRETE_ACTIONS"] = True
        INIT_HP["MAX_ACTION"] = None
        INIT_HP["MIN_ACTION"] = None
    
    # Create a population ready for evolutionary hyper-parameter optimisation
    pop = initialPopulation(
        INIT_HP["ALGO"],
        state_dim,
        action_dim,
        one_hot,
        NET_CONFIG,
        INIT_HP,
        population_size=INIT_HP["POPULATION_SIZE"],
        device=device,
    ) 
    
    # Configure the multi-agent replay buffer
    field_names = ["state", "action", "reward", "next_state", "done"]
    
    memory = []
    for i in range(INIT_HP["POPULATION_SIZE"]):
        memory.append(ReplayBuffer(memory_size=10000, field_names=field_names, action_dim=action_dim[i]))


# agent = TD3(state_dim=state_dim, action_dim=action_dim, one_hot=one_hot, max_action=max_action)   # Create TD3 agent

state = env.reset()[0]  # Reset environment at start of episode
while True:
    action = agent.getAction(state)    # Get next action from agent
    next_state, reward, done, _, _ = env.step(action)   # Act in environment

    # Save experience to replay buffer
    if channels_last:
        memory.save2emoryVectEnvs(state, action, reward, np.moveaxis(next_state, [-1], [-3]), done)
    else:
        memory.save2memoryVectEnvs(state, action, reward, next_state, done)

    # Learn according to learning frequency
    if len(memory) >= agent.batch_size:
        experiences = memory.sample(agent.batch_size) # Sample replay buffer
        agent.learn(experiences)    # Learn according to agent's RL algorithm

AttributeError: 'aec_to_parallel_wrapper' object has no attribute 'single_action_space'

In [10]:
import numpy as np
import torch
from agilerl.algorithms.td3 import TD3
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.utils.utils import initialPopulation
from pettingzoo.sisl import multiwalker_v9
from tqdm import trange

# Configure the environment
env = multiwalker_v9.parallel_env(render_mode="human")
env.reset()

# Configure the multi-agent algo input arguments
try:
    state_dim = [env.observation_space(agent).shape[0] for agent in env.agents]
    one_hot = False
except Exception:
    state_dim = [env.observation_space(agent).n for agent in env.agents]
    one_hot = True

try:
    action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
    discrete_actions = False
    max_action = [env.action_space(agent).high for agent in env.agents]
    min_action = [env.action_space(agent).low for agent in env.agents]
except Exception:
    action_dim = [env.action_space(agent).n for agent in env.agents]
    discrete_actions = True
    max_action = None
    min_action = None

# Define the network configuration
NET_CONFIG = {
    "arch": "mlp",  # Network architecture
    "h_size": [128, 128],  # Network hidden size
}

# Define the initial hyperparameters
INIT_HP = {
    "POPULATION_SIZE": 2,
    "ALGO": "TD3",  # Algorithm
    "CHANNELS_LAST": False,
    "BATCH_SIZE": 64,  # Batch size
    "LR": 0.001,  # Learning rate
    "GAMMA": 0.95,  # Discount factor
    "MEMORY_SIZE": 50000,  # Max memory buffer size
    "LEARN_STEP": 50,  # Learning frequency
    "TAU": 0.01,  # For soft update of target parameters
    "DISCRETE_ACTIONS": discrete_actions,
    "MAX_ACTION": max_action,
    "MIN_ACTION": min_action,
}

# Append number of agents and agent IDs to the initial hyperparameter dictionary
INIT_HP["N_AGENTS"] = env.num_agents
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
    INIT_HP["ALGO"],
    state_dim,
    action_dim,
    one_hot,
    NET_CONFIG,
    INIT_HP,
    population_size=INIT_HP["POPULATION_SIZE"],
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Configure the multi-agent replay buffer
field_names = ["state", "action", "reward", "next_state", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Training loop parameters
max_episodes = 5  # Total episodes (default: 6000)
max_steps = 900  # Maximum steps to take in each episode
epsilon = 1.0  # Starting epsilon value
eps_end = 0.1  # Final epsilon value
eps_decay = 0.995  # Epsilon decay

# Training loop
for idx_epi in trange(max_episodes):
    state = env.reset()  # Reset environment at start of episode
    agent_reward = {agent_id: 0 for agent_id in env.agents}

    for _ in range(max_steps):
        actions = {}
        state_dict = {agent_id: state[agent_id] for agent_id in env.agents}

        for agent_idx, agent in enumerate(pop):
            cont_actions, discrete_action = agent.getAction(
                state_dict, epsilon, agent_mask={env.agents[agent_idx]: True}
            )
            action = discrete_action if agent.discrete_actions else cont_actions
            actions.update(action)

        next_state, reward, termination, truncation, info = env.step(actions)  # Act in environment

        # Save experiences to replay buffer
        memory.save2memory(state_dict, actions, reward, next_state, termination)

        # Collect the reward
        for agent_id, r in reward.items():
            agent_reward[agent_id] += r

        # Learn according to learning frequency
        for agent_idx, agent in enumerate(pop):
            agent_id = env.agents[agent_idx]
            if (memory.counter % agent.learn_step == 0) and (len(memory) >= agent.batch_size):
                experiences = memory.sample(agent.batch_size)  # Sample replay buffer
                agent.learn(experiences)  # Learn according to agent's RL algorithm

        state = next_state  # Update the state

        # Stop episode if any agents have terminated
        if any(termination.values()) or any(truncation.values()):
            break

    # Save the total episode reward
    score = sum(agent_reward.values())
    for agent in pop:
        agent.scores.append(score)

    # Update epsilon for exploration
    epsilon = max(eps_end, epsilon * eps_decay)

# Save the trained algorithm
path = "./models/TD3"
filename = "TD3_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)

KeyError: 'POLICY_FREQ'