# LBF

python3 -m venv lbf-env
source lbf-env/bin/activate
pip install -e lb-foraging/

python -m ipykernel install --user --name=lbf-env --display-name "Python (lb-foraging)"


In [3]:
!pip install lbforaging



In [2]:
import lbforaging 
import gymnasium as gym

In [11]:
env.action_space

Tuple(Discrete(6), Discrete(6))

In [6]:
env.observation_space

Tuple(Box([-1. -1.  0. -1. -1.  0. -1. -1.  0.], [7. 7. 4. 7. 7. 2. 7. 7. 2.], (9,), float32), Box([-1. -1.  0. -1. -1.  0. -1. -1.  0.], [7. 7. 4. 7. 7. 2. 7. 7. 2.], (9,), float32))

In [28]:
num_agents = env.unwrapped.n_agents
print(f'Number of agents: {num_agents}')

Number of agents: 2


In [None]:
import lbforaging 
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
import wandb

# ---------------- Environment ID ----------------
# Single source of truth for the task ID: the value logged to W&B below is the
# same object that is passed to gym.make(), so the two can never drift apart.
env_conf = "Foraging-8x8-2p-4f-v3"

# ---------------- WandB setup ----------------
run = wandb.init(
    project="LBF",
    config={
        "env_name": env_conf,
    },
    sync_tensorboard=True,
    save_code=True,
)

# ---------------- Environment ----------------
env = gym.make(env_conf)
'''env = gym.make("Foraging<obs>-<x_size>x<y_size>-<n_agents>p-<food>f<force_c>-v1")
    • <obs>: This optional field can either be empty ("") or indicate a partially observable task
    with visibility radius of two fields ("-2s").
    • <x_size>: This field indicates the horizontal size of the environment map and can by
    default take any values between 5 and 20.
    • <y_size>: This field indicates the vertical size of the environment map and can by default
    take any values between 5 and 20. It should be noted, that upon import only environments
    with square dimensions (<x_size> = <y_size>) are registered and ready for creation.
    • <n_agents>: This field indicates the number of agents within the environment. By default,
    any values between 2 and 5 are automatically registered.
    • <food>: This field indicates the number of food items scattered within the environment. It
    can take any values between 1 and 10 by default.
    • <force_c>: This optional field can either be empty ("") or indicate a task with only
    "cooperative food" ("-coop"). In the latter case, the environment will only contain food of a
    level such that all agents have to cooperate in order to pick the food up. This mode should
    only be used with up to four agents.'''

# ---------------- Hyperparameters ----------------
# Optimisation: Adam learning rate, discount factor and minibatch size.
LR = 0.001
GAMMA = 0.99
BATCH_SIZE = 64

# Replay memory: maximum number of stored transitions, and the number of
# random-action environment steps collected before training starts.
MEMORY_SIZE = 50_000
BURN_IN = 500

# Training length and epsilon-greedy schedule. Epsilon is multiplied by
# EPSILON_DECAY once per episode and never drops below EPSILON_MIN.
MAX_EPISODES = 20_000
EPSILON_START = 1.0
EPSILON_MIN = 0.05
EPSILON_DECAY = 0.99

# Torch device string; 'cuda' if GPU available
DEVICE = "cpu"

# ---------------- DQN Network ----------------
class DQN(nn.Module):
    """Fully connected Q-network mapping an observation to one Q-value per action.

    The network is a 3-layer MLP (obs_dim -> 128 -> 128 -> act_dim) and owns
    its own Adam optimizer, exposed as ``self.optimizer``.

    Args:
        obs_dim: Length of the flat observation vector fed to the first layer.
        act_dim: Number of discrete actions (width of the output layer).
        lr: Learning rate of the Adam optimizer.
        device: Torch device the model and its inputs are placed on. Any value
            accepted by ``Tensor.to`` works (e.g. ``'cpu'``, ``'cuda'``,
            ``'cuda:0'`` or a ``torch.device``).
    """

    def __init__(self, obs_dim, act_dim, lr=LR, device=DEVICE):
        super().__init__()
        self.device = device
        self.model = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim)
        )
        # Move the weights for *any* requested device (not only the literal
        # string 'cuda') and do it before the optimizer is created, so the
        # weights always live where forward() sends its inputs.
        self.model.to(device)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        """Return the Q-values for ``x``.

        ``x`` may be a NumPy array, a (nested) sequence of numbers, or a
        tensor that already lives on any device; it is converted to a float32
        tensor on ``self.device`` without copying when no conversion is needed.
        """
        # as_tensor accepts tensors on any device, unlike the legacy
        # torch.FloatTensor(...) constructor which rejects CUDA tensors.
        x = torch.as_tensor(x, dtype=torch.float32, device=self.device)
        return self.model(x)

    def get_action(self, state, epsilon=0.05):
        """Pick an action for a single ``state`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest Q-value.
        """
        if np.random.random() < epsilon:
            return np.random.randint(self.model[-1].out_features)
        # Action selection never needs gradients; skip building the graph.
        with torch.no_grad():
            qvals = self.forward(state)
        return torch.argmax(qvals).item()

# ---------------- Replay Buffer ----------------
class ReplayBuffer:
    """Bounded store of transitions with uniform sampling without replacement.

    Transitions are kept in a ``deque`` created with ``maxlen=capacity``, so
    once the buffer is full each new transition evicts the oldest one.
    """

    def __init__(self, capacity=MEMORY_SIZE):
        # Record type used for every stored transition.
        self.transition = namedtuple(
            'Transition',
            ['state', 'action', 'reward', 'done', 'next_state'],
        )
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def append(self, state, action, reward, done, next_state):
        """Wrap the five fields in a ``Transition`` and push it to the buffer."""
        record = self.transition(
            state=state,
            action=action,
            reward=reward,
            done=done,
            next_state=next_state,
        )
        self.buffer.append(record)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns an iterator of five tuples, in field order:
        states, actions, rewards, dones, next_states.
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        # Transpose the list of records into one tuple per field.
        return zip(*(self.buffer[idx] for idx in picked))

# ---------------- Independent Q-Learning Agent ----------------
class IQLAgent:
    """A single independent Q-learner with its own network, buffer and epsilon.

    Args:
        obs_dim: Length of this agent's flat observation vector.
        act_dim: Number of discrete actions available to this agent.
        device: Torch device forwarded to the Q-network.
        lr: Learning rate forwarded to the Q-network's optimizer.
        gamma: Discount factor used in the one-step TD target.
        epsilon: Initial exploration rate (stored as ``self.epsilon``).
        eps_decay: Multiplicative decay factor (stored as ``self.eps_decay``);
            this class only stores it, applying it is left to the caller.
    """

    def __init__(self, obs_dim, act_dim, device=DEVICE, lr=LR, gamma=GAMMA, epsilon=EPSILON_START, eps_decay=EPSILON_DECAY):
        self.qnet = DQN(obs_dim, act_dim, lr, device)
        self.buffer = ReplayBuffer()
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = eps_decay

    def store_transition(self, state, action, reward, done, next_state):
        """Append one transition to this agent's replay buffer."""
        self.buffer.append(state, action, reward, done, next_state)

    def update(self, batch_size=BATCH_SIZE):
        """Run one gradient step of Q-learning on a sampled minibatch.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions. The TD target is ``r + gamma * max_a Q(s', a)``, with the
        bootstrap term zeroed for transitions flagged ``done``.
        """
        if len(self.buffer) < batch_size:
            return
        states, actions, rewards, dones, next_states = self.buffer.sample(batch_size)
        device = self.qnet.device

        # Stack each field into ONE contiguous NumPy array first: building a
        # tensor straight from a tuple of ndarrays is the slow element-wise
        # path. States are handed to the network as arrays so that its
        # forward() performs the (device-aware) tensor conversion.
        states = np.asarray(states, dtype=np.float32)
        next_states = np.asarray(next_states, dtype=np.float32)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=device).unsqueeze(-1)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=device)
        dones = torch.as_tensor(np.asarray(dones, dtype=bool), device=device)

        # Squeeze only the gathered action axis; a bare squeeze() would also
        # drop the batch axis when batch_size == 1 and silently broadcast in
        # the loss.
        qvals = self.qnet(states).gather(1, actions).squeeze(-1)

        # The target is a constant w.r.t. the parameters: no graph needed.
        with torch.no_grad():
            q_next = self.qnet(next_states).max(dim=1)[0]
            q_next = q_next.masked_fill(dones, 0.0)
            target = rewards + self.gamma * q_next

        loss = nn.MSELoss()(qvals, target)
        self.qnet.optimizer.zero_grad()
        loss.backward()
        self.qnet.optimizer.step()

# ---------------- Multi-Agent Training Loop ----------------
def train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE):
    """Train one independent Q-learning agent per player in ``env``.

    The function first fills every agent's replay buffer with at least
    ``BURN_IN`` environment steps of uniformly random joint actions, then runs
    ``n_episodes`` episodes of epsilon-greedy interaction, performing one
    update per agent per environment step and decaying each agent's epsilon
    once per episode (never below ``EPSILON_MIN``).

    ``env`` is used through the Gymnasium API: ``reset()`` returns
    ``(obs, info)`` and ``step()`` returns
    ``(obs, rewards, terminated, truncated, info)``. Observations, actions and
    rewards are indexed per agent.

    Args:
        env: Multi-agent environment exposing ``unwrapped.n_agents`` and
            tuple observation/action spaces with one entry per agent.
        n_episodes: Number of training episodes after the burn-in phase.
        device: Torch device forwarded to every agent.

    Returns:
        ``(agents, rewards_history)`` where ``rewards_history[i]`` is the list
        of per-episode returns of agent ``i``.
    """
    num_agents = env.unwrapped.n_agents
    agents = [
        IQLAgent(
            env.observation_space[i].shape[0],
            env.action_space[i].n,
            device=device,
        )
        for i in range(num_agents)
    ]

    rewards_history = [[] for _ in range(num_agents)]

    # Burn-in: populate the replay buffers with random experience. An episode
    # that is in progress when BURN_IN is reached is still played to its end.
    print("Burn-in buffer with random actions...")
    steps = 0
    while steps < BURN_IN:
        # Gymnasium's reset() returns (observations, info); only the
        # observations are indexable per agent.
        obs, _ = env.reset()
        episode_over = False
        while not episode_over:
            actions = [env.action_space[i].sample() for i in range(num_agents)]
            next_obs, rewards, terminated, truncated, _ = env.step(actions)
            terminated = bool(terminated)
            for i, agent in enumerate(agents):
                # Only true termination is stored as `done`: a time-limit
                # truncation must not zero the bootstrap term of the target.
                agent.store_transition(obs[i], actions[i], rewards[i], terminated, next_obs[i])
            obs = next_obs
            steps += 1
            episode_over = terminated or bool(truncated)
    print("Burn-in complete.\nStarting training...")

    # Main training loop
    for ep in range(1, n_episodes + 1):
        obs, _ = env.reset()
        total_rewards = [0] * num_agents
        # A plain boolean flag: a per-agent list such as [False, False] is
        # truthy and would make `while not done` skip the episode entirely.
        episode_over = False

        while not episode_over:
            # Select one epsilon-greedy action per agent
            actions = [
                agent.qnet.get_action(obs[i], epsilon=agent.epsilon)
                for i, agent in enumerate(agents)
            ]

            # Step environment
            next_obs, rewards, terminated, truncated, _ = env.step(actions)
            terminated = bool(terminated)

            # Store transitions & train agents
            for i, agent in enumerate(agents):
                agent.store_transition(obs[i], actions[i], rewards[i], terminated, next_obs[i])
                agent.update()
                total_rewards[i] += rewards[i]

            obs = next_obs
            episode_over = terminated or bool(truncated)

        # Decay epsilon
        for agent in agents:
            agent.epsilon = max(EPSILON_MIN, agent.epsilon * agent.eps_decay)

        # Logging: mean over the last 100 recorded returns plus the episode
        # that just finished (which is appended to the history further down).
        if ep % 10 == 0:
            avg_rewards = [
                np.mean(rewards_history[i][-100:] + [total_rewards[i]])
                for i in range(num_agents)
            ]
            print(f"Episode {ep}, Avg rewards: {avg_rewards}")
            wandb.log(
                {f"agent_{i}_avg_reward": avg for i, avg in enumerate(avg_rewards)},
                step=ep,
            )

        for i in range(num_agents):
            rewards_history[i].append(total_rewards[i])

    return agents, rewards_history

# ---------------- Train ----------------
# Training is long-running and is often interrupted by hand; the try/finally
# guarantees that the W&B run is finished and the environment is closed even
# when training or plotting raises (including KeyboardInterrupt).
try:
    agents, rewards_history = train_iql(env, n_episodes=MAX_EPISODES, device=DEVICE)

    # ---------------- Plot learning curves ----------------
    plt.figure(figsize=(10, 5))
    for i, history in enumerate(rewards_history):
        plt.plot(history, label=f"Agent {i}")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Learning Curves")
    plt.legend()
    plt.show()
finally:
    # ---------------- Cleanup ----------------
    wandb.finish()
    env.close()


wandb: ERROR Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: marionapla to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


<OrderEnforcing<PassiveEnvChecker<ForagingEnv<Foraging-8x8-2p-4f-v3>>>>


'\n# ---------------- Hyperparameters ----------------\nLR = 1e-3\nMEMORY_SIZE = 50000\nMAX_EPISODES = 20000\nEPSILON_START = 1.0\nEPSILON_DECAY = 0.99\nEPSILON_MIN = 0.05\nGAMMA = 0.99\nBATCH_SIZE = 64\nBURN_IN = 500\nDEVICE = \'cpu\'  # \'cuda\' if GPU available\n\n# ---------------- DQN Network ----------------\nclass DQN(nn.Module):\n    def __init__(self, obs_dim, act_dim, lr=LR, device=DEVICE):\n        super(DQN, self).__init__()\n        self.device = device\n        self.model = nn.Sequential(\n            nn.Linear(obs_dim, 128),\n            nn.ReLU(),\n            nn.Linear(128, 128),\n            nn.ReLU(),\n            nn.Linear(128, act_dim)\n        )\n        self.optimizer = optim.Adam(self.parameters(), lr=lr)\n        if device == \'cuda\':\n            self.model.cuda()\n        \n    def forward(self, x):\n        x = torch.FloatTensor(x).to(self.device)\n        return self.model(x)\n    \n    def get_action(self, state, epsilon=0.05):\n        if np.random.rando