# Project 4: Multi-Agent RL Baseline

Train multiple agents to cooperate in the Simple Spread environment.

**Runtime:** ~5 minutes for baseline

## Setup

In [None]:
!pip install stable-baselines3 pettingzoo[mpe] supersuit -q

In [None]:
from pettingzoo.mpe import simple_spread_v3
import supersuit as ss
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

## Create Multi-Agent Environment

**Simple Spread:**
- 3 agents (cooperative)
- 3 landmarks
- Goal: Each agent covers a different landmark
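- Reward: Shared team reward equal to the negative sum, over all landmarks, of the distance from each landmark to its closest agent, plus a per-agent penalty for collisions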

In [None]:
# Create base environment
env = simple_spread_v3.parallel_env(
    N=3,                    # Number of agents
    max_cycles=25,          # Episode length
    continuous_actions=False
)

# Reset before inspecting the env: PettingZoo only guarantees that the live
# agent list (`agents`, and therefore `num_agents`) exists after reset().
observations, infos = env.reset(seed=42)
agent = env.agents[0]

print("Environment: Simple Spread")
print(f"  Number of agents: {env.num_agents}")
print(f"  Agents: {env.agents}")

# Show the observation and action spaces of one representative agent
print(f"\nAgent '{agent}':")
print(f"  Observation shape: {observations[agent].shape}")
print(f"  Action space: {env.action_space(agent)}")
print("    (5 actions: no-op, left, right, up, down)")

## Preprocess Environment

Use SuperSuit to convert the parallel environment into a vectorized environment that Stable-Baselines3 can train on (each agent becomes one sub-environment):

In [None]:
# Wrap environment for SB3, starting from a fresh Simple Spread instance
spread_env = simple_spread_v3.parallel_env(
    N=3,
    max_cycles=25,
    continuous_actions=False,
)

# First turn the PettingZoo env into a vector env, then package that vector
# env behind the Stable-Baselines3 VecEnv interface (one copy, one process).
markov_vec_env = ss.pettingzoo_env_to_vec_env_v1(spread_env)
env = ss.concat_vec_envs_v1(
    markov_vec_env,
    num_vec_envs=1,
    num_cpus=1,
    base_class='stable_baselines3',
)

print("Environment wrapped for SB3")
for label, space in (
    ("Observation space", env.observation_space),
    ("Action space", env.action_space),
):
    print(f"  {label}: {space}")

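## Train Agents (PPO with Parameter Sharing)

The wrapped environment exposes each agent as one sub-environment, so a single PPO policy is trained on the experience of all agents (parameter sharing) rather than one policy per agent: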

In [None]:
# Create PPO agent; hyperparameters are gathered in one place for easy tuning
ppo_hyperparams = {
    "learning_rate": 1e-3,
    "n_steps": 2048,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.99,
    "verbose": 1,
    "seed": 42,
}
model = PPO(MlpPolicy, env, **ppo_hyperparams)

# Run the training loop on the wrapped environment
print("\nTraining agents (~5 minutes)...\n")
model.learn(total_timesteps=100_000)
print("\nTraining complete!")

## Evaluate Trained Agents

In [None]:
# Evaluate
def evaluate_multi_agent(model, n_episodes=20, vec_env=None):
    """Roll out ``model`` deterministically and collect per-episode returns.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        n_episodes: Number of evaluation episodes to run.
        vec_env: Environment with the vectorized ``reset()`` / ``step()``
            interface (``step`` returns ``obs, reward, done, info``).
            Defaults to the module-level ``env``.

    Returns:
        A list with one entry per episode. Each entry is a float array
        holding the summed reward of every sub-environment of ``vec_env``
        (a 0-d array if the environment reports scalar rewards).
    """
    if vec_env is None:
        vec_env = env  # fall back to the notebook's wrapped training env

    episode_rewards = []

    for _ in range(n_episodes):
        obs = vec_env.reset()
        episode_reward = 0.0
        finished = False

        while not finished:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = vec_env.step(action)
            # Out-of-place add so the env's own reward buffer is never mutated.
            episode_reward = episode_reward + np.asarray(reward, dtype=float)
            # `done` holds one flag per sub-environment; using the array
            # directly in `while not done` raises ValueError, so reduce it
            # explicitly. The episode is over once every flag is set.
            finished = bool(np.all(done))

        episode_rewards.append(episode_reward)

    return episode_rewards


rewards = evaluate_multi_agent(model, n_episodes=20)

# Summary statistics over all collected returns
separator = '=' * 50
mean_reward = np.mean(rewards)

print(f"\n{separator}")
print("Evaluation (20 episodes):")
print(separator)
print(f"  Mean reward: {mean_reward:.2f} ± {np.std(rewards):.2f}")
print(f"  Min: {np.min(rewards):.2f}")
print(f"  Max: {np.max(rewards):.2f}")
print(separator)

# Higher (less negative) is better
verdict = (
    "\n✓ Good coordination! Agents spread out well."
    if mean_reward > -30
    else "\n⚠ Agents need more training to coordinate."
)
print(verdict)

## Visualize Agent Behavior

In [None]:
# Record one episode of a render-enabled environment and show three frames.
raw_env = simple_spread_v3.parallel_env(
    N=3, max_cycles=25,
    continuous_actions=False,
    render_mode='rgb_array'
)

observations, infos = raw_env.reset(seed=42)
frames = []
episode_reward = {agent: 0 for agent in raw_env.agents}

try:
    for step in range(25):  # Max steps
        # Capture the frame before acting
        frames.append(raw_env.render())

        # Simple random policy for visualization
        actions = {agent: raw_env.action_space(agent).sample() for agent in raw_env.agents}

        # Named `step_rewards` so the evaluation results stored in the global
        # `rewards` are not overwritten by this cell.
        observations, step_rewards, terminations, truncations, infos = raw_env.step(actions)

        # Iterate the returned reward dict rather than `raw_env.agents`:
        # PettingZoo removes finished agents from `agents`, which would drop
        # the rewards of the final step.
        for agent, reward in step_rewards.items():
            episode_reward[agent] = episode_reward.get(agent, 0) + reward

        if all(terminations.values()) or all(truncations.values()):
            break
finally:
    # Release the renderer's resources even if the rollout fails
    raw_env.close()

print("Episode rewards per agent:")
for agent, reward in episode_reward.items():
    print(f"  {agent}: {reward:.2f}")

# Show sample frames: first, middle and last
if len(frames) >= 3:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    indices = [0, len(frames)//2, -1]

    for i, idx in enumerate(indices):
        axes[i].imshow(frames[idx])
        # Translate the negative index into the actual step number for the title
        axes[i].set_title(f"Step {idx if idx >= 0 else len(frames)+idx}")
        axes[i].axis('off')

    plt.suptitle("Multi-Agent Behavior (Blue=Agents, Black=Landmarks)")
    plt.tight_layout()
    plt.show()

# Wording kept consistent with the plot title above
print("\nNote: Blue circles are agents, dark circles are landmarks")
print("Goal: Each agent should cover a different landmark")

## Analyze Coordination

Measure how well agents spread out:

In [None]:
# Analyze distance between agents
def analyze_coordination(n_episodes=10, model=None):
    """Measure agent spread.

    Runs ``n_episodes`` episodes of Simple Spread and, at every step,
    computes the smallest pairwise distance between agents. Larger values
    mean the agents keep further apart.

    Args:
        n_episodes: Number of episodes to roll out.
        model: Optional policy exposing ``predict(obs, deterministic=True)``
            that returns ``(action, state)``. When omitted, agents act
            randomly, which gives a baseline to compare a trained policy
            against.

    Returns:
        A list with one float per episode: the mean, over the steps of that
        episode, of the minimum pairwise agent distance.
    """
    env_test = simple_spread_v3.parallel_env(N=3, max_cycles=25, continuous_actions=False)

    min_distances = []

    try:
        for _ in range(n_episodes):
            observations, _ = env_test.reset()
            step_minima = []

            for step in range(25):
                if model is None:
                    actions = {
                        agent: env_test.action_space(agent).sample()
                        for agent in env_test.agents
                    }
                else:
                    # predict() returns (action, state); the env expects a
                    # plain integer for its discrete action space.
                    actions = {
                        agent: int(model.predict(observations[agent], deterministic=True)[0])
                        for agent in env_test.agents
                    }
                observations, _, terminations, truncations, _ = env_test.step(actions)

                # Per the PettingZoo docs a Simple Spread observation starts
                # with [self_vel (2), self_pos (2), ...], so entries 2:4 hold
                # the agent's own absolute position.
                positions = np.array([obs[2:4] for obs in observations.values()])
                if len(positions) >= 2:
                    deltas = positions[:, None, :] - positions[None, :, :]
                    pair_dists = np.linalg.norm(deltas, axis=-1)
                    # The upper triangle holds each unordered pair exactly
                    # once and excludes the zero self-distances.
                    upper = pair_dists[np.triu_indices(len(positions), k=1)]
                    step_minima.append(float(upper.min()))

                if all(terminations.values()) or all(truncations.values()):
                    break

            if step_minima:
                min_distances.append(float(np.mean(step_minima)))
    finally:
        env_test.close()

    return min_distances

# Print the reading guide for the coordination metrics, one line per entry
for line in (
    "Coordination analysis:",
    "  (Higher rewards = better spread)",
    "  (Goal: Each agent near different landmark)",
):
    print(line)

## Next Steps

Improve multi-agent coordination:

1. **Parameter sharing vs. independent policies:** Compare a single shared policy for all agents with separately trained per-agent policies
2. **Communication:** Add message passing between agents
3. **Curriculum:** Start with 2 agents, increase to 3
4. **Competitive:** Try Simple Tag (predator-prey)
5. **Different algorithms:** MADDPG, QMIX, CommNet

See `project4_multi_agent_README.md` for detailed ideas!

**Note:** Multi-agent RL is an active research area with many open challenges!