# LBF

python3 -m venv lbf-env
source lbf-env/bin/activate
pip install -e lb-foraging/

python -m ipykernel install --user --name=lbf-env --display-name "Python (lb-foraging)"


In [3]:
!pip install lbforaging



In [11]:
env.action_space

Tuple(Discrete(6), Discrete(6))

In [6]:
env.observation_space

Tuple(Box([-1. -1.  0. -1. -1.  0. -1. -1.  0.], [7. 7. 4. 7. 7. 2. 7. 7. 2.], (9,), float32), Box([-1. -1.  0. -1. -1.  0. -1. -1.  0.], [7. 7. 4. 7. 7. 2. 7. 7. 2.], (9,), float32))

In [28]:
num_agents = env.unwrapped.n_agents
print(f'Number of agents: {num_agents}')

Number of agents: 2


In [None]:
import lbforaging 
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
import wandb

# ---------------- WandB setup ----------------
run = wandb.init(
    project="LBF",
    config={
        "env_name": "Foraging-8x8-2p-4f-v3",
    },
    sync_tensorboard=True,
    save_code=True,
)

# ---------------- Environment ----------------
env_conf = "Foraging-8x8-2p-4f-v3"
env = gym.make(env_conf)
'''env = gym.make("Foraging<obs>-<x_size>x<y_size>-<n_agents>p-<food>f<force_c>-v1")
    • <obs>: This optional field can either be empty ("") or indicate a partially observable task
    with visibility radius of two fields ("-2s).
    • <x_size>: This field indicates the horizontal size of the environment map and can by
    default take any values between 5 and 20.
    • <y_size>: This field indicates the vertical size of the environment map and can by default
    take any values between 5 and 20. It should be noted, that upon import only environments
    with square dimensions (<x_size> = <y_size>) are registered and ready for creation.
    • <n_agents>: This field indicates the number of agents within the environment. By default,
    any values between 2 and 5 are automatically registered.
    • <food>: This field indicates the number of food items scattered within the environment. It
    can take any values between 1 and 10 by default.
    • <force_c>: This optional field can either be empty ("") or indicate a task with only
    "cooperative food" ("-coop". In the latter case, the environment will only contain food of a
    level such that all agents have to cooperate in order to pick the food up. This mode should
    only be used with up to four agents.'''

# ---------------- Hyperparameters ----------------
LR = 1e-3
MEMORY_SIZE = 5000
MAX_EPISODES = 20000
EPSILON_START = 1.0
EPSILON_DECAY = 0.995
EPSILON_MIN = 0.05
GAMMA = 0.99
BATCH_SIZE = 64
BURN_IN = 1000
DEVICE = 'cpu'  # 'cuda' if GPU available

# ---------------- DQN Network ----------------
class DQN(nn.Module):
    def __init__(self, obs_dim, act_dim, lr=LR, device=DEVICE):
        super(DQN, self).__init__()
        self.device = device
        self.model = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim)
        )
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        if device == 'cuda':
            self.model.cuda()
        
    def forward(self, x):
        x = torch.FloatTensor(x).to(self.device)
        if x.ndim > 2:
            x = x.view(x.size(0), -1)  # flatten multi-dim observations
        return self.model(x)
    
    def get_action(self, state, epsilon=0.05):
        if np.random.random() < epsilon:
            return np.random.randint(self.model[-1].out_features)
        else:
            qvals = self.forward(state)
            print(f'qvals: {qvals}  ')    #########
            return torch.argmax(qvals).item()

# ---------------- Replay Buffer ----------------
class ReplayBuffer:
    def __init__(self, capacity=MEMORY_SIZE):
        self.buffer = deque(maxlen=capacity)
        self.transition = namedtuple('Transition', ['state', 'action', 'reward', 'done', 'next_state'])
    
    def append(self, state, action, reward, done, next_state):
        self.buffer.append(self.transition(state, action, reward, done, next_state))
    
    def sample(self, batch_size):
        idxs = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[i] for i in idxs]
        return zip(*batch)
    
    def __len__(self):
        return len(self.buffer)

# ---------------- Independent Q-Learning Agent ----------------
class IQLAgent:
    def __init__(self, obs_dim, act_dim, device=DEVICE, lr=LR, gamma=GAMMA, epsilon=EPSILON_START, eps_decay=EPSILON_DECAY):
        self.qnet = DQN(obs_dim, act_dim, lr, device)
        self.buffer = ReplayBuffer()
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = eps_decay
    
    def store_transition(self, state, action, reward, done, next_state):
        self.buffer.append(state, action, reward, done, next_state)
    
    def update(self, batch_size=BATCH_SIZE):
        if len(self.buffer) < batch_size:
            return
        states, actions, rewards, dones, next_states = self.buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.qnet.device)
        actions = torch.LongTensor(actions).unsqueeze(-1).to(self.qnet.device)
        rewards = torch.FloatTensor(rewards).to(self.qnet.device)
        dones = torch.BoolTensor(dones).to(self.qnet.device)
        next_states = torch.FloatTensor(next_states).to(self.qnet.device)
        
        qvals = self.qnet(states).gather(1, actions)
        q_next = self.qnet(next_states).max(dim=1)[0].detach()
        q_next[dones] = 0
        target = rewards + self.gamma * q_next
        
        loss = nn.MSELoss()(qvals.squeeze(), target)
        self.qnet.optimizer.zero_grad()
        loss.backward()
        self.qnet.optimizer.step()

# ---------------- Multi-Agent Training Loop ----------------
def train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE):
    num_agents = env.unwrapped.n_agents
    agents = []
    for i in range(num_agents):
        obs_dim = np.prod(env.observation_space[i].shape)
        act_dim = env.action_space[i].n
        agents.append(IQLAgent(obs_dim, act_dim, device=device))
        print(f"Initialized Agent {i}: obs_dim={obs_dim}, act_dim={act_dim}")
    
    rewards_history = [[] for _ in range(num_agents)]

    # ----------- Burn-in Phase -----------
    print("Burn-in buffer with random actions...")
    steps = 0
    while steps < BURN_IN:
        print(f"Burn-in steps: {steps}/{BURN_IN}", end='\r')
        obs, _ = env.reset()
        done = False
        while not done:
            actions = [env.action_space[i].sample() for i in range(num_agents)]
            print(f"Actions taken: {actions}", end='\r')
            next_obs, rewards, terminated, truncated, infos = env.step(actions)
            done = np.any(terminated) or np.any(truncated)
            for i, agent in enumerate(agents):
                agent.store_transition(obs[i].flatten(), actions[i], rewards[i], done, next_obs[i].flatten())
            obs = next_obs
            steps += 1
    print("Burn-in complete.\nStarting training...")

    # ----------- Main Training Loop -----------
    for ep in range(1, n_episodes + 1):
        obs, _ = env.reset()
        done = False
        total_rewards = [0] * num_agents

        while not done:
            # Select actions
            actions = []
            for i, agent in enumerate(agents):
                a = agent.qnet.get_action(obs[i].flatten(), epsilon=agent.epsilon)
                actions.append(a)

            # Step environment
            next_obs, rewards, terminated, truncated, infos = env.step(actions)
            done = np.any(terminated) or np.any(truncated)

            # Store transitions & train agents
            for i, agent in enumerate(agents):
                agent.store_transition(obs[i].flatten(), actions[i], rewards[i], done, next_obs[i].flatten())
                agent.update()
                total_rewards[i] += rewards[i]

            obs = next_obs

        # Decay epsilon
        for agent in agents:
            agent.epsilon = max(EPSILON_MIN, agent.epsilon * agent.eps_decay)

        # Logging
        if ep % 10 == 0:
            avg_rewards = [np.mean(rewards_history[i][-100:] + [total_rewards[i]]) for i in range(num_agents)]
            print(f"Episode {ep}, Avg rewards: {avg_rewards}")
            for i, avg in enumerate(avg_rewards):
                wandb.log({f"agent_{i}_avg_reward": avg}, step=ep)
                wandb.log({f"agent_{i}_epsilon": agents[i].epsilon}, step=ep)
        
        for i in range(num_agents):
            wandb.log({f"agent_{i}_total_reward": total_rewards[i]}, step=ep)

        
        import os
        if ep % 1000 == 0:
            save_dir = "./checkpoints"
            os.makedirs(save_dir, exist_ok=True)  #
            for i, agent in enumerate(agents):
                save_path = os.path.join(save_dir, f"agent_{i}_ep{ep}.pt")
                torch.save(agent.qnet.state_dict(), save_path)
                print(f"Saved checkpoint: {save_path}")


        for i in range(num_agents):
            rewards_history[i].append(total_rewards[i])

    return agents, rewards_history

# ---------------- Train ----------------
agents, rewards_history = train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE)

# ---------------- Plot learning curves ----------------
# plt.figure(figsize=(10, 5))
# for i in range(env.unwrapped.n_agents):
#     plt.plot(rewards_history[i], label=f"Agent {i}")
# plt.xlabel("Episode")
# plt.ylabel("Total Reward")
# plt.title("Learning Curves")
# plt.legend()
# plt.show()

# ---------------- Cleanup ----------------
wandb.finish()
env.close()


wandb: ERROR Unable to save notebook session history.


Initialized Agent 0: obs_dim=18, act_dim=6
Initialized Agent 1: obs_dim=18, act_dim=6
Burn-in buffer with random actions...
Actions taken: [4, 3]

  logger.warn(


Burn-in complete., 4]00
Starting training...
6
6


  dones = torch.BoolTensor(dones).to(self.qnet.device)


6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
tensor([104.9944,  98.1328,  98.2259, 102.1938, 102.8031, 102.6620],
       grad_fn=<ViewBackward0>)
6
6
6
tensor([103.0663,  97.9787, 100.5698, 105.9366, 100.8547, 106.3618],
       grad_fn=<ViewBackward0>)
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
tensor([122.7260, 133.5234, 126.5396, 124.1794, 131.5818, 123.7723],
       grad_fn=<ViewBackward0>)
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6

KeyboardInterrupt: 

In [21]:
# ===========================
# Imports and setup
# ===========================
import lbforaging
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
import wandb

# ===========================
# Environment setup
# ===========================
env_conf = "Foraging-8x8-2p-4f-v3"
env = gym.make(env_conf)


# ===========================
# Hyperparameters
# ===========================
LR = 1e-3
MEMORY_SIZE = 5000
MAX_EPISODES = 5000
EPSILON_START = 1.0
EPSILON_DECAY = 0.999
EPSILON_MIN = 0.05
GAMMA = 0.99
BATCH_SIZE = 64
BURN_IN = 1000
DEVICE = 'cpu'

# ===========================
# WandB initialization
# ===========================
run = wandb.init(
    project="LBF_proves",
    config={
        "env_name": env_conf,
        'learning_rate': LR,
        'memory_size': MEMORY_SIZE,
        'max_episodes': MAX_EPISODES,
        'epsilon_start': EPSILON_START,
        'epsilon_decay': EPSILON_DECAY,
        'epsilon_min': EPSILON_MIN,
        'gamma': GAMMA,
        'batch_size': BATCH_SIZE,
        'burn_in': BURN_IN,
        
    },
    sync_tensorboard=True,
    save_code=True,
)

'''config={
    "model": '2-step_DQN',
    "learning_rate": lr,
    "gamma": GAMMA,
    "MAX_EPISODES": MAX_EPISODES,
    "EPSILON_DECAY": EPSILON_DECAY, 
    "batch_size": BATCH_SIZE
})'''

# ===========================
# Deep Q-Network (DQN)
# ===========================
class DQN(nn.Module):
    """A simple 3-layer fully connected Deep Q-Network."""
    def __init__(self, obs_dim, act_dim, lr=LR, device=DEVICE):
        super(DQN, self).__init__()
        self.device = device

        self.model = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        if device == 'cuda':
            self.model.cuda()
        
    def forward(self, x):
        x = torch.FloatTensor(x).to(self.device)
        if x.ndim > 2:
            x = x.view(x.size(0), -1)
        return self.model(x)
    
    def get_action(self, state, epsilon=0.05):
        if np.random.random() < epsilon:
            return np.random.randint(self.model[-1].out_features)
        else:
            qvals = self.forward(state)
            return torch.argmax(qvals).item()

# ===========================
# Replay Buffer
# ===========================
class ReplayBuffer:
    """Stores past experiences for experience replay."""
    def __init__(self, capacity=MEMORY_SIZE):
        self.buffer = deque(maxlen=capacity)
        self.transition = namedtuple('Transition', ['state', 'action', 'reward', 'done', 'next_state'])
    
    def append(self, state, action, reward, done, next_state):
        self.buffer.append(self.transition(state, action, reward, done, next_state))
    
    def sample(self, batch_size):
        idxs = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[i] for i in idxs]
        return zip(*batch)
    
    def __len__(self):
        return len(self.buffer)

# ===========================
# Independent Q-Learning Agent
# ===========================
class IQLAgent:
    """Independent Q-Learning agent for multi-agent environments."""
    def __init__(self, obs_dim, act_dim, device=DEVICE, lr=LR, gamma=GAMMA, epsilon=EPSILON_START, eps_decay=EPSILON_DECAY):
        self.qnet = DQN(obs_dim, act_dim, lr, device)
        self.buffer = ReplayBuffer()
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.last_loss = None  # Store last loss for logging
    
    def store_transition(self, state, action, reward, done, next_state):
        self.buffer.append(state, action, reward, done, next_state)
    
    def update(self, batch_size=BATCH_SIZE):
        if len(self.buffer) < batch_size:
            return
        states, actions, rewards, dones, next_states = self.buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.qnet.device)
        actions = torch.LongTensor(actions).unsqueeze(-1).to(self.qnet.device)
        rewards = torch.FloatTensor(rewards).to(self.qnet.device)
        dones = torch.BoolTensor(dones).to(self.qnet.device)
        next_states = torch.FloatTensor(next_states).to(self.qnet.device)
        
        qvals = self.qnet(states).gather(1, actions)
        q_next = self.qnet(next_states).max(dim=1)[0].detach()
        q_next[dones] = 0
        target = rewards + self.gamma * q_next
        
        loss = nn.MSELoss()(qvals.squeeze(), target)
        self.qnet.optimizer.zero_grad()
        loss.backward()
        self.qnet.optimizer.step()

        # Store last loss
        self.last_loss = loss.item()

# ===========================
# Multi-Agent IQL Training Loop
# ===========================
def train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE):
    num_agents = env.unwrapped.n_agents
    agents = []

    for i in range(num_agents):
        obs_dim = np.prod(env.observation_space[i].shape)
        act_dim = env.action_space[i].n
        agents.append(IQLAgent(obs_dim, act_dim, device=device))
        print(f"Initialized Agent {i}: obs_dim={obs_dim}, act_dim={act_dim}")
    
    rewards_history = [[] for _ in range(num_agents)]

    # ----------- Burn-in Phase -----------
    print("Burn-in buffer with random actions...")
    steps = 0
    while steps < BURN_IN:
        print(f"Burn-in steps: {steps}/{BURN_IN}", end='\r')
        obs, _ = env.reset()
        done = False
        while not done:
            actions = [env.action_space[i].sample() for i in range(num_agents)]
            next_obs, rewards, terminated, truncated, infos = env.step(actions)
            done = np.any(terminated) or np.any(truncated)

            for i, agent in enumerate(agents):
                agent.store_transition(obs[i].flatten(), actions[i], rewards[i], done, next_obs[i].flatten())
            
            obs = next_obs
            steps += 1
    print("Burn-in complete.\nStarting training...")

    total_env_steps = 0
    # ----------- Main Training Loop -----------
    for ep in range(1, n_episodes + 1):
        obs, _ = env.reset()
        done = False
        total_rewards = [0] * num_agents

        while not done:
            actions = []
            for i, agent in enumerate(agents):
                a = agent.qnet.get_action(obs[i].flatten(), epsilon=agent.epsilon)
                actions.append(a)

            next_obs, rewards, terminated, truncated, infos = env.step(actions)
            done = np.any(terminated) or np.any(truncated)
            total_env_steps += 1

            for i, agent in enumerate(agents):
                agent.store_transition(obs[i].flatten(), actions[i], rewards[i], done, next_obs[i].flatten())
                agent.update()
                total_rewards[i] += rewards[i]

            obs = next_obs

        # Decay epsilon
        for agent in agents:
            agent.epsilon = max(EPSILON_MIN, agent.epsilon * agent.eps_decay)

        # -------- Logging --------
        avg_rewards = [np.mean(rewards_history[i][-100:] + [total_rewards[i]]) for i in range(num_agents)]
        if ep % 10 == 0:
            print(f"Episode {ep}, Avg rewards: {avg_rewards}")

        for i in range(num_agents):
            wandb.log({f"agent_{i}_loss": agent.last_loss}, step=total_env_steps)

        for i in range(num_agents):
            wandb.log({
                f"agent_{i}_total_reward": total_rewards[i],
                f"agent_{i}_avg_reward": avg_rewards[i],
                f"agent_{i}_epsilon": agents[i].epsilon,
                # f"agent_{i}_loss": agents[i].last_loss if agents[i].last_loss is not None else 0.0
            }, step=ep)

        # -------- Save model checkpoints every 1000 episodes --------
        import os
        if ep % 100 == 0:
            save_dir = "./checkpoints"
            os.makedirs(save_dir, exist_ok=True)
            for i, agent in enumerate(agents):
                save_path = os.path.join(save_dir, f"agent_{i}_ep{ep}.pt")
                torch.save(agent.qnet.state_dict(), save_path)
                print(f"Saved checkpoint: {save_path}")

        for i in range(num_agents):
            rewards_history[i].append(total_rewards[i])

    return agents, rewards_history

# ===========================
# Run training
# ===========================
agents, rewards_history = train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE)

# ===========================
# Optional: Plot learning curves
# ===========================
# plt.figure(figsize=(10, 5))
# for i in range(env.unwrapped.n_agents):
#     plt.plot(rewards_history[i], label=f"Agent {i}")
# plt.xlabel("Episode")
# plt.ylabel("Total Reward")
# plt.title("Learning Curves")
# plt.legend()
# plt.show()

# ===========================
# Cleanup
# ===========================
wandb.finish()
env.close()


wandb: ERROR Unable to save notebook session history.


0,1
agent_0_avg_reward,▇▇▇██▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▂▂▂▂▂▂▂▂▂
agent_0_epsilon,██▇▇▇▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_0_loss,▅▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_0_total_reward,▁▁▆▇▆▅▇▁▅▁▁▁▅▁▄▁▄▁▁█▁▆▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_1_avg_reward,▇▇▇▇█▇▅▄▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▂▂▂▂▂▃▄▄▃▃▂▃▃▄▃▃
agent_1_epsilon,██▇▇▇▇▆▅▄▄▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_1_loss,▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_1_total_reward,▁▆▆▆▁▁▁▁▆▁▅▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▃▃▄▁▁▁▅▁▁█

0,1
agent_0_avg_reward,0.02881
agent_0_epsilon,0.05
agent_0_loss,0.00032
agent_0_total_reward,0.0
agent_1_avg_reward,0.04753
agent_1_epsilon,0.05
agent_1_loss,0.00032
agent_1_total_reward,0.0


Initialized Agent 0: obs_dim=18, act_dim=6
Initialized Agent 1: obs_dim=18, act_dim=6
Burn-in buffer with random actions...
Burn-in steps: 0/1000

  logger.warn(


Burn-in complete.0/1000
Starting training...


  dones = torch.BoolTensor(dones).to(self.qnet.device)


Episode 10, Avg rewards: [0.06877344877344878, 0.07833333333333334]
Episode 20, Avg rewards: [0.07980339105339104, 0.07310897435897436]
Episode 30, Avg rewards: [0.07354858104858104, 0.11241896991896991]
Episode 40, Avg rewards: [0.07852453102453102, 0.10073287823287824]
Episode 50, Avg rewards: [0.08984343434343434, 0.08652036852036851]
Episode 60, Avg rewards: [0.08931397306397307, 0.09626697376697377]
Episode 70, Avg rewards: [0.09191197691197692, 0.08763978085406658]
Episode 80, Avg rewards: [0.09418380230880231, 0.08517038517038517]
Episode 90, Avg rewards: [0.09615720699054031, 0.0840235073568407]
Episode 100, Avg rewards: [0.09112481962481961, 0.08262115662115661]
Saved checkpoint: ./checkpoints\agent_0_ep100.pt
Saved checkpoint: ./checkpoints\agent_1_ep100.pt
Episode 110, Avg rewards: [0.09311788321689311, 0.08639608466341141]
Episode 120, Avg rewards: [0.09793086451502292, 0.08627549568143628]
Episode 130, Avg rewards: [0.09848413412769848, 0.07918692968197918]
Episode 140, Av

0,1
agent_0_avg_reward,████▆▅▄▃▄▄▃▂▁▃▃▃▃▃▂▂▂▂▂▁▁▂▂▂▁▂▂▁▁▂▃▃▃▃▃▂
agent_0_epsilon,██▇▇▆▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_0_loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_0_total_reward,▁▁▅▁▁█▅▃▁▅▆▆▁▁▁▁▁▁▁▁▁▄▁▁▁▁▄▁▁▁▁▁▆▁▁▁▁▁▁▅
agent_1_avg_reward,▇▆▆▅▇▄▄█▆▄▆▅▅▄▃▄▄▃▆▃▂▂▅▅▁▁▂▂▄▄▄▄▅▃▃▃▃▃▄▄
agent_1_epsilon,█▆▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_1_loss,█▃▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
agent_1_total_reward,▁▃▁▅▅▁▁▅▁█▁▁▁▆▁▁▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▅▃▁▁▁▁▁▆

0,1
agent_0_avg_reward,0.02891
agent_0_epsilon,0.05
agent_0_loss,0.00047
agent_0_total_reward,0.0
agent_1_avg_reward,0.04125
agent_1_epsilon,0.05
agent_1_loss,0.00047
agent_1_total_reward,0.0


In [22]:
import torch

# Number of agents
num_agents = env.unwrapped.n_agents

# Observation and action dimensions for each agent
agents = []
for i in range(num_agents):
    obs_dim = np.prod(env.observation_space[i].shape)
    act_dim = env.action_space[i].n
    agent = IQLAgent(obs_dim, act_dim, device='cpu')  # device can be 'cuda'
    
    # Load the trained model
    checkpoint_path = f"./checkpoints/agent_{i}_ep5000.pt"
    agent.qnet.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    
    agents.append(agent)

obs, _ = env.reset()
done = False
total_rewards = [0] * num_agents

while not done:
    actions = []
    for i, agent in enumerate(agents):
        # ε = 0 → fully greedy actions
        action = agent.qnet.get_action(obs[i].flatten(), epsilon=0.0)
        actions.append(action)
        
    print(actions)
    
    next_obs, rewards, terminated, truncated, infos = env.step(actions)
    done = np.any(terminated) or np.any(truncated)

    print(next_obs)
    # Accumulate rewards
    for i in range(num_agents):
        total_rewards[i] += rewards[i]

    obs = next_obs

print(f"Total rewards for this episode: {total_rewards}")



[0, 1]
(array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 0., 2., 1., 2., 1.,
       1.], dtype=float32), array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 2., 1., 1., 0., 2.,
       1.], dtype=float32))
[0, 1]
(array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 0., 2., 1., 2., 1.,
       1.], dtype=float32), array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 2., 1., 1., 0., 2.,
       1.], dtype=float32))
[0, 1]
(array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 0., 2., 1., 2., 1.,
       1.], dtype=float32), array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 2., 1., 1., 0., 2.,
       1.], dtype=float32))
[0, 1]
(array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 0., 2., 1., 2., 1.,
       1.], dtype=float32), array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 2., 1., 1., 0., 2.,
       1.], dtype=float32))
[0, 1]
(array([1., 1., 1., 1., 4., 2., 4., 3., 1., 6., 2., 1., 0., 2., 1., 2., 1.,
       1.], dtype=float32), array([1., 1., 1., 1., 4., 2., 4.

In [27]:
N = 10
avg_rewards = np.zeros(num_agents)

for ep in range(N):
    obs, _ = env.reset()
    done = False
    ep_rewards = np.zeros(num_agents)
    
    while not done:
        actions = [agent.qnet.get_action(obs[i].flatten(), epsilon=0.0) for i, agent in enumerate(agents)]
        obs, rewards, terminated, truncated, infos = env.step(actions)
        print(actions)
        done = np.any(terminated) or np.any(truncated)
        ep_rewards += rewards
    
    avg_rewards += ep_rewards

avg_rewards /= N
print(f"Average rewards over {N} episodes: {avg_rewards}")


[3, 1]
[4, 1]
[1, 1]
[1, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[2, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[0, 1]
[1, 1]
[1, 1]
[0, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]
[5, 1]