In [None]:
import gymnasium as gym
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.algorithms.ddpg import DDPG
from pettingzoo.sisl import multiwalker_v9
import numpy as np


# Build the multi-walker environment, then size the replay buffer and the
# per-agent DDPG learners from its observation and action spaces.
num_envs = 1
# Rendering variant: multiwalker_v9.parallel_env(render_mode='human')
env = multiwalker_v9.parallel_env(forward_reward=100)
env.reset()  # reset before env.agents is read below

# Per-agent observation size: first try the leading dimension of the space's
# shape; if that fails, fall back to the space's `.n` and flag the
# observations as requiring one-hot encoding.
try:
    state_dim = [env.observation_space(name).shape[0] for name in env.agents]
except Exception:
    state_dim = [env.observation_space(name).n for name in env.agents]
    one_hot = True
else:
    one_hot = False

# Per-agent action size, probed the same way (shape first, then `.n`).
try:
    action_dim = [env.action_space(name).shape[0] for name in env.agents]
except Exception:
    action_dim = [env.action_space(name).n for name in env.agents]

# A single replay buffer, sized from the first agent's action dimension.
field_names = ["state", "action", "reward", "next_state", "done"]
memory = ReplayBuffer(
    action_dim=action_dim[0],
    memory_size=300,
    field_names=field_names,
)

# One DDPG learner per agent name, all built with the same arguments.
# NOTE(review): each learner receives the whole per-agent `state_dim` list
# rather than its own entry -- confirm this is the form DDPG expects.
ddpg_kwargs = dict(state_dim=state_dim, action_dim=action_dim[0], one_hot=one_hot)
agents = {name: DDPG(**ddpg_kwargs) for name in env.agents}

# Per-episode return history and the reward trace of the current episode.
reward_evolution, rewards = [], []

# Training loop: every walker acts with its own DDPG policy, while the
# transitions of a single tracked walker are stored in the replay buffer and
# used for learning.
#
# Fixes relative to the previous version of this loop:
#   * `state` stays the full per-agent observation dict, so action selection
#     keeps working after the first step (it used to be overwritten with one
#     walker's array, which broke `state[a]` on the next step).
#   * The learner is looked up explicitly; the old code used `agent`, a name
#     that only existed inside the action comprehension.
#   * The buffer receives one walker's (state, action, reward, next_state)
#     instead of a mix of whole dicts and single-walker values.
#   * The terminal placeholder observation is zero-filled and shaped like the
#     real observation instead of uninitialised `np.empty((31,))`.
#   * Truncation ends the episode as well as termination, and no walker names
#     other than the tracked one are hard-coded.
#   * The episode return is recorded after the episode has run, so
#     `reward_evolution` holds exactly one entry per episode.
tracked_agent = 'walker_0'
# NOTE(review): only the tracked walker's policy is trained, because the
# buffer only holds its transitions -- confirm this is the intended setup.
learner = agents[tracked_agent]

for i in range(20):
    # Reset environment at start of episode; element 0 of the reset result
    # is the per-agent observation dict.
    state = env.reset()[0]
    done = [False]
    rewards = []

    print(state)

    while not done[0]:
        # Get the next action for every walker that has an observation.
        actions = {a: agents[a].getAction(obs)[0] for a, obs in state.items()}

        # Act in environment.
        next_state, reward, terminations, truncations, _ = env.step(actions)

        tracked_state = state[tracked_agent]
        if tracked_agent in next_state:
            tracked_next_state = next_state[tracked_agent]
            # The episode ends as soon as any walker terminates or is truncated.
            episode_over = any(terminations.values()) or any(truncations.values())
        else:
            # The tracked walker is gone: end the episode and store a
            # well-defined placeholder as its final observation.
            tracked_next_state = np.zeros_like(tracked_state)
            episode_over = True
        done = [episode_over]

        reward = reward.get(tracked_agent, 0.0)
        rewards.append(reward)
        memory.save2memory(
            tracked_state, actions[tracked_agent], reward, tracked_next_state, done
        )

        state = next_state

        # Learn once the buffer holds at least one full batch.
        if len(memory) >= learner.batch_size:
            experiences = memory.sample(learner.batch_size)  # Sample replay buffer
            learner.learn(experiences)  # Learn according to agent's RL algorithm

    # Report and record the return of the episode that just finished.  The
    # printed label says "mean", but the value is the sum of the rewards.
    episode_return = np.sum(rewards)
    print(i, "th iteration \nrewards mean : ", episode_return)
    reward_evolution.append(episode_return)


0 th iteration 
rewards mean :  0.0
{'walker_0': array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.6236708e-01, 1.6546237e-01,
       4.4486943e-01, 4.7198775e-01, 5.1494205e-01, 5.8084798e-01,
       6.8370843e-01, 8.5414934e-01, 9.1396463e-01, 8.7226135e-01,
       0.0000000e+00, 0.0000000e+00, 3.4010395e-01, 5.7477987e-04,
       3.3892679e-01, 8.2960449e-02, 2.8945488e-04], dtype=float32), 'walker_1': array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  1.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  1.6236708e-01,  1.6546237e-01,
        4.4205815e-01,  4.6539271e-01,  5.0883532e-01,  5.7480043e-01,
        6.8294215e-01,  8.7188619e-01,  9.1396