In [9]:
import gymnasium as gym
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.algorithms.ddpg import DDPG
from pettingzoo.sisl import multiwalker_v9
import numpy as np


# Create environment and Experience Replay Buffer
num_envs = 1
# env = multiwalker_v9.parallel_env(render_mode='human')
env = multiwalker_v9.parallel_env(forward_reward=10)
env.reset()
try:
    state_dim = [env.observation_space(agent).shape[0] for agent in env.agents]
    one_hot = False
    
except Exception:
    state_dim = [env.observation_space(agent).n for agent in env.agents]
    one_hot = True
# Requires one-hot encoding
try:
    action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
except Exception:
    action_dim = [env.action_space(agent).n for agent in env.agents]
    
field_names = ["state", "action", "reward", "next_state", "done"]
memory = ReplayBuffer(memory_size=500, field_names=field_names, action_dim=action_dim[0])
agent = DDPG(state_dim=state_dim, action_dim=action_dim[0], one_hot=one_hot)   # Create DDPG agent
rewards = []

for i in range(1000):
    print(i, "th iteration \nrewards mean : ", np.sum(rewards))
    
    state = env.reset()[0]  # Reset environment at start of episode
    state = state['walker_0']
    done = [False]
    rewards = []
    

    while done[0] == False:
        action = agent.getAction(state)[0]    # Get next action from agent
        actions = {agent: action for agent in env.agents}
        next_state, reward, done, _, _ = env.step(actions)   # Act in environment
        if 'walker_0' in next_state.keys():
            next_state = next_state['walker_0']
            done = [True] if done['walker_0'] or done['walker_1'] or done['walker_2'] else [False]
        else :
            next_state = np.empty((31,))
            done = [True]
        reward = reward['walker_0']
        rewards.append(reward)
        memory.save2memory(state, action, reward, next_state, done)

        state = next_state

        # Learn according to learning frequency
        if len(memory) >= agent.batch_size:
            experiences = memory.sample(agent.batch_size) # Sample replay buffer
            agent.learn(experiences)    # Learn according to agent's RL algorithm

0 th iteration 
rewards mean :  0.0
1 th iteration 
rewards mean :  -106.26932568351428
2 th iteration 
rewards mean :  -102.54361574848494
3 th iteration 
rewards mean :  -115.02610028783481
4 th iteration 
rewards mean :  -107.04760018984477
5 th iteration 
rewards mean :  -110.98648107051851
6 th iteration 
rewards mean :  -101.50973057746887
7 th iteration 
rewards mean :  -100.69636671741804
8 th iteration 
rewards mean :  -110.7435751358668
9 th iteration 
rewards mean :  -108.28732281923294
10 th iteration 
rewards mean :  -106.00361476341882
11 th iteration 
rewards mean :  -105.5601915717125
12 th iteration 
rewards mean :  -107.76516040166219
13 th iteration 
rewards mean :  -111.06254919370016
14 th iteration 
rewards mean :  -118.40700884660086
15 th iteration 
rewards mean :  -113.79928518335025
16 th iteration 
rewards mean :  -1.2063116294642315
17 th iteration 
rewards mean :  -0.4359728967150077
18 th iteration 
rewards mean :  -0.14456207553545636
19 th iteration 
rew

In [11]:
import os
# Save the trained algorithm
path = "./models/ONE_DDPG"
filename = "DDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
agent.saveCheckpoint(save_path)

displays 10 iterations of trained agent 

In [None]:

env = multiwalker_v9.parallel_env(render_mode='human')


field_names = ["state", "action", "reward", "next_state", "done"]
memory = ReplayBuffer(memory_size=500, field_names=field_names, action_dim=action_dim[0])
agent = DDPG(state_dim=state_dim, action_dim=action_dim[0], one_hot=one_hot)   # Create DDPG agent
# Load the trained MADDPG model
path = "./models/ONE_DDPG/DDPG_trained_agent.pt"
DDPG.loadCheckpoint(agent,path=path)
rewards = []

for i in range(10):
    print(i, "th iteration \nrewards mean : ", np.sum(rewards))
    
    state = env.reset()[0]  # Reset environment at start of episode
    state = state['walker_0']
    done = [False]
    rewards = []
    

    while done[0] == False:
        action = agent.getAction(state)[0]    # Get next action from agent
        actions = {agent: action for agent in env.agents}
        next_state, reward, done, _, _ = env.step(actions)   # Act in environment
        if 'walker_0' in next_state.keys():
            next_state = next_state['walker_0']
            done = [True] if done['walker_0'] or done['walker_1'] or done['walker_2'] else [False]
        else :
            next_state = np.empty((31,))
            done = [True]
        reward = reward['walker_0']
        rewards.append(reward)
        memory.save2memory(state, action, reward, next_state, done)

        state = next_state

        # Learn according to learning frequency
        if len(memory) >= agent.batch_size:
            experiences = memory.sample(agent.batch_size) # Sample replay buffer
            agent.learn(experiences)    # Learn according to agent's RL algorithm


0 th iteration 
rewards mean :  0.0
1 th iteration 
rewards mean :  -1.0646911611159664
2 th iteration 
rewards mean :  -1.5638678545753184
3 th iteration 
rewards mean :  -1.8441924452781704
4 th iteration 
rewards mean :  -0.15699916829666005
5 th iteration 
rewards mean :  -0.7568547911942028
6 th iteration 
rewards mean :  0.7100427613283173
7 th iteration 
rewards mean :  0.9349573043485435
8 th iteration 
rewards mean :  -0.4564826339483259
9 th iteration 
rewards mean :  2.515440544734396


: 