<a href="https://colab.research.google.com/github/mahault/Multi-agent-sustainability/blob/main/Multiagent_Sustainability_toy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Environment Setup


*   Grid Setup: The environment consists of a 3x3 grid.
*   Agent Dynamics: Two agents that can move, communicate, and consume resources.
*   Resource Dynamics: Water and food are placed randomly and can deplete and replenish.




In [54]:
!pip install pettingzoo



In [55]:
import torch
import torch.nn.functional as F
from torch import nn

class Actor(nn.Module):
    def __init__(self, obs_size, action_size):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU(),
            nn.Linear(128, action_size),
            nn.Tanh()  # Assuming actions are scaled between -1 and 1
        )

    def forward(self, x):
        return self.net(x)

class Critic(nn.Module):
    def __init__(self, obs_size, action_size):
        super(Critic, self).__init__()
        # Input is concatenation of obs and actions for all agents
        self.net = nn.Sequential(
            nn.Linear(obs_size + action_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x, actions):
        return self.net(torch.cat([x, actions], dim=1))


In [56]:
class MADDPGAgent:
    def __init__(self, obs_size, action_size, n_agents):
        self.actor = Actor(obs_size, action_size).float()  # Ensure the model is in float
        self.critic = Critic(obs_size * n_agents, action_size * n_agents).float()  # Adjust according to the total number of agents

        self.position = (0, 0)  # Example initial position
        self.water_timer = 3
        self.food_timer = 7
        # Initialize beliefs, etc.
        # Assume beliefs and other state variables are initialized similarly

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.01)  # Example learning rate
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.01)  # Example learning rate

        self.target_actor = Actor(obs_size, action_size).float()
        self.target_critic = Critic(obs_size * n_agents, action_size * n_agents).float()

        # Initialize target networks with the same weights as the original networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action = self.actor(state)
        return action.detach().numpy().flatten()
    def update(self, batch, agent_id, agents, gamma=0.99, tau=0.01):
        states, actions, rewards, next_states, dones = batch

        # Get the number of agents
        n_agents = len(agents)

        # Get the current agent's state and action
        state = states[agent_id]
        action = actions[agent_id]

        # Get the next state for the current agent
        next_state = next_states[agent_id]

        # Get the target actions for the next states using the target actor networks
        next_actions = []
        for i in range(n_agents):
            next_action = agents[i].target_actor(next_states[i])
            next_actions.append(next_action)
        next_actions = torch.cat(next_actions, dim=1)

        # Compute the target Q-value using the target critic network
        target_q = agents[agent_id].target_critic(next_state, next_actions)
        target_q = rewards[agent_id] + (gamma * target_q * (1 - dones[agent_id]))

        # Compute the current Q-value using the critic network
        current_q = agents[agent_id].critic(state, actions)

        # Compute the critic loss using the TD error (Q-targets - Q-values)
        critic_loss = F.mse_loss(current_q, target_q)

        # Update the critic network by minimizing the critic loss
        agents[agent_id].critic_optimizer.zero_grad()
        critic_loss.backward()
        agents[agent_id].critic_optimizer.step()

        # Compute the actor loss using the critic network and the current actions
        actor_loss = -agents[agent_id].critic(state, actions).mean()

        # Update the actor network by minimizing the actor loss
        agents[agent_id].actor_optimizer.zero_grad()
        actor_loss.backward()
        agents[agent_id].actor_optimizer.step()

        # Perform soft updates of the target networks for both actor and critic
        for target_param, param in zip(agents[agent_id].target_actor.parameters(), agents[agent_id].actor.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

        for target_param, param in zip(agents[agent_id].target_critic.parameters(), agents[agent_id].critic.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

In [57]:
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector
from gym import spaces
import numpy as np

class ForagingEnv(AECEnv):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super().__init__()
        self.grid_size = 3
        self.n_agents = 2  # Number of agents
        self.observation_size = self.grid_size * self.grid_size * 3 + 2
        self.action_size = 6
        self.agents = [MADDPGAgent(self.observation_size, self.action_size, self.n_agents) for _ in range(self.n_agents)]
        self.agent_selector = agent_selector(self.agents)
        self.action_spaces = {agent: spaces.Discrete(6) for agent in self.agents}  # Add one for "communicate" action
        self.observation_spaces = {agent: spaces.Dict({
            "grid": spaces.Box(low=0, high=2, shape=(self.grid_size, self.grid_size, 3), dtype=np.float32),
            "state": spaces.Dict({
                "position": spaces.MultiDiscrete([self.grid_size, self.grid_size]),
                "water_timer": spaces.Discrete(3),  # Timesteps until death without water
                "food_timer": spaces.Discrete(7),  # Timesteps until death without food
                "beliefs": spaces.Dict({
                    "water_replenish_rate": spaces.Discrete(10),  # Example max rate
                    "food_replenish_rate": spaces.Discrete(10),
                })
            })
        }) for agent in self.agents}


        self.grid = np.zeros((self.grid_size, self.grid_size, 3), dtype=np.float32)  # Third dimension for agent presence, water, food
        self.resource_counters = {"water": [np.inf, 0], "food": [np.inf, 0]}  # [acquisitions left, replenishment timer]
        self.agent_states = {agent: {"position": None, "water_timer": 3, "food_timer": 7, "beliefs": {"water_replenish_rate": np.inf, "food_replenish_rate": np.inf}} for agent in self.agents}
        self.current_agent = None
        self.messages = {agent: "" for agent in self.agents}  # Initialize messages for each agent

    def reset(self):
        self.agent_selector.reinit(self.agents)
        self.current_agent = self.agent_selector.next()
        self.grid *= 0  # Clear the grid

        # Randomly place water and food, initialize resource counters
        water_position = np.random.choice(self.grid_size**2)
        food_position = np.random.choice(self.grid_size**2)
        while food_position == water_position:
            food_position = np.random.choice(self.grid_size**2)

        self.grid[water_position // self.grid_size, water_position % self.grid_size, 1] = 1
        self.grid[food_position // self.grid_size, food_position % self.grid_size, 2] = 1
        self.resource_counters["water"] = [np.random.randint(1, 5), np.random.randint(5, 15)]  # Random example values
        self.resource_counters["food"] = [np.random.randint(1, 5), np.random.randint(5, 15)]

        # Set initial positions for agents and reset their states
        for i, agent in enumerate(self.agents):
            while True:
                pos = np.random.choice(self.grid_size**2)
                if self.grid[pos // self.grid_size, pos % self.grid_size].sum() == 0:  # Ensure the position is empty
                    self.grid[pos // self.grid_size, pos % self.grid_size, 0] = i + 1  # Mark agent's presence
                    self.agent_states[agent]["position"] = (pos // self.grid_size, pos % self.grid_size)
                    self.agent_states[agent]["water_timer"] = 3
                    self.agent_states[agent]["food_timer"] = 7
                    break

    def step(self, action):
      self.messages = {agent: "" for agent in self.agents}  # Reset messages
      agent = self.current_agent
      reward = 0  # Initialize reward for the current step

      # Assume an initial penalty for moving to encourage efficient movement
      if action < 4:  # Movement actions
          move_success = self.move_agent(agent, action)
          reward -= 1 if move_success else 5  # Penalize more if the move isn't successful (e.g., walking into a wall)
      elif action == 4:  # Consumption action
          reward += self.consume_resources(agent)
      elif action == 5:  # Communication action
          communicated_successfully = self.communicate(agent)
          reward += 2 if communicated_successfully else -2  # Reward or penalize based on communication success

      # Update survival timers and check conditions
      self.agent_states[agent]["water_timer"] -= 1
      self.agent_states[agent]["food_timer"] -= 1
      if self.agent_states[agent]["water_timer"] <= 0 or self.agent_states[agent]["food_timer"] <= 0:
          self.terminate_agent(agent)
          reward -= 50  # Significant penalty for dying

      # small survival reward for each timestep the agent survives without taking any specific action
      reward += 0.1

      # Move to the next agent
      self.current_agent = self.agent_selector.next()

      return reward

    def move_agent(self, agent, direction):
        pos = self.agent_states[agent]["position"]
        move_success = False  # Assume move is unsuccessful by default
        if direction == 0:  # Up
            new_pos = (max(pos[0] - 1, 0), pos[1])
        elif direction == 1:  # Down
            new_pos = (min(pos[0] + 1, self.grid_size - 1), pos[1])
        elif direction == 2:  # Left
            new_pos = (pos[0], max(pos[1] - 1, 0))
        else:  # Right
            new_pos = (pos[0], min(pos[1] + 1, self.grid_size - 1))

        # Update position if the new position is not occupied
        if self.grid[new_pos[0], new_pos[1], 0] == 0:
            self.grid[pos[0], pos[1], 0] = 0  # Remove agent from old position
            self.grid[new_pos[0], new_pos[1], 0] = 1  # Add agent to new position
            self.agent_states[agent]["position"] = new_pos
            move_success = True  # The move was successful
        return move_success

    def consume_resources(self, agent):
      pos = self.agent_states[agent]["position"]
      reward = 0
      # Check for water
      if self.grid[pos[0], pos[1], 1] == 1:
          # Consume water if available
          if self.resource_counters["water"][0] > 0:  # If water is available
              self.resource_counters["water"][0] -= 1  # Decrement water availability
              self.agent_states[agent]["water_timer"] = 3  # Reset water timer
              reward += 10  # Reward for consuming water
              if self.resource_counters["water"][0] <= 0:  # If water is now depleted
                  self.grid[pos[0], pos[1], 1] = 0  # Remove water from the grid
      # Check for food
      elif self.grid[pos[0], pos[1], 2] == 1:
          # Consume food if available
          if self.resource_counters["food"][0] > 0:  # If food is available
              self.resource_counters["food"][0] -= 1  # Decrement food availability
              self.agent_states[agent]["food_timer"] = 7  # Reset food timer
              reward += 10  # Reward for consuming food
              if self.resource_counters["food"][0] <= 0:  # If food is now depleted
                  self.grid[pos[0], pos[1], 2] = 0  # Remove food from the grid
      return reward

    def communicate(self, sender):
      pos = self.agent_states[sender]["position"]
      water_timer = self.agent_states[sender]["water_timer"]
      food_timer = self.agent_states[sender]["food_timer"]
      water_belief = self.agent_states[sender]["beliefs"]["water_replenish_rate"]
      food_belief = self.agent_states[sender]["beliefs"]["food_replenish_rate"]

      # Determine the urgency level for water and food based on remaining timers
      water_urgency = "high" if water_timer <= 2 else "low"
      food_urgency = "high" if food_timer <= 3 else "low"

      message = {
          "location": pos,
          "water_belief": water_belief,
          "food_belief": food_belief,
          "water_urgency": water_urgency,
          "food_urgency": food_urgency,
          "found": None
      }

      communicated_successfully = False
      # Check for the presence of water or food at the sender's location
      if self.grid[pos[0], pos[1], 1] == 1:  # Water found
          message["found"] = "water"
          communicated_successfully = True
      elif self.grid[pos[0], pos[1], 2] == 1:  # Food found
          message["found"] = "food"
          communicated_successfully = True

      # If something was found, broadcast the message
      if communicated_successfully:
          for agent in self.agents:
              if agent != sender:
                  self.messages[agent] = message

      return communicated_successfully

    def terminate_agent(self, agent):
        # logic for handling agent termination
        self.agents.remove(agent)  # Remove the agent from the active list
        self.grid[self.agent_states[agent]["position"][0], self.agent_states[agent]["position"][1], 0] = 0  # Clear the agent from the grid
        del self.agent_states[agent]  # Remove the agent's state


    def observe(self, agent):
      # Return agent-specific observations including both grid and their internal state
      observation = self.grid.copy()
      agent_state = self.agent_states[agent]
      observed_message = self.messages[agent]

      # If there's a message, update beliefs based on the message
      if observed_message:
          if observed_message["found"] == "water":
              agent_state["beliefs"]["water_replenish_rate"] = observed_message["water_belief"]
          elif observed_message["found"] == "food":
              agent_state["beliefs"]["food_replenish_rate"] = observed_message["food_belief"]

      return {"grid": observation, "state": agent_state, "message": observed_message}

    def update_resources(self):
      # Iterate through each resource to update its status
      for resource, counter in self.resource_counters.items():
          if counter[0] <= 0:  # If depleted
              counter[1] -= 1  # Decrement replenishment timer
              if counter[1] <= 0:  # If it's time to replenish
                  self.replenish_resource(resource)

    def replenish_resource(self, resource):
      # Randomly choose a new position for the resource
      position = np.random.choice(self.grid_size**2)
      if resource == "water":
          self.grid[position // self.grid_size, position % self.grid_size, 1] = 1  # Place water
          self.resource_counters["water"] = [np.random.randint(1, 5), np.random.randint(5, 15)]  # Reset counters
      elif resource == "food":
          self.grid[position // self.grid_size, position % self.grid_size, 2] = 1  # Place food
          self.resource_counters["food"] = [np.random.randint(1, 5), np.random.randint(5, 15)]  # Reset counters

    def render(self, mode="human"):
        # Visualize the current state of the environment, including agent positions, resources, and timers
        if mode == "human":
          for r in range(self.grid_size):
              print("+---" * self.grid_size + "+")
              for c in range(self.grid_size):
                  cell = " "
                  if self.grid[r, c, 0] > 0:  # Agent presence
                      cell = "A"
                  elif self.grid[r, c, 1] == 1:  # Water
                      cell = "W"
                  elif self.grid[r, c, 2] == 1:  # Food
                      cell = "F"
                  print(f"| {cell} ", end="")
              print("|")
          print("+---" * self.grid_size + "+")
        elif mode == "rgb_array":
          self.ax.clear()
          self.ax.axis('off')

          # Draw the grid
          for r in range(self.grid_size):
              for c in range(self.grid_size):
                  if self.grid[r, c, 0] > 0:  # Agent presence
                      self.ax.text(c, r, 'A', fontsize=12, ha='center', va='center', color='blue')
                  elif self.grid[r, c, 1] == 1:  # Water
                      self.ax.text(c, r, 'W', fontsize=12, ha='center', va='center', color='cyan')
                  elif self.grid[r, c, 2] == 1:  # Food
                      self.ax.text(c, r, 'F', fontsize=12, ha='center', va='center', color='green')

          # Draw grid lines
          self.ax.set_xticks(np.arange(-0.5, self.grid_size, 1), minor=True)
          self.ax.set_yticks(np.arange(-0.5, self.grid_size, 1), minor=True)
          self.ax.grid(which="minor", color="w", linestyle='-', linewidth=2)
          self.ax.tick_params(which="minor", size=0)

          # Update the display
          self.fig.canvas.draw()
          plt.pause(0.01)  # Small delay to allow for real-time updating




In [58]:
# Usage
env = ForagingEnv()
env.reset()
print(env.observe(env.agents[0]))

{'grid': array([[[2., 0., 0.],
        [0., 0., 0.],
        [0., 0., 1.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 1., 0.]],

       [[1., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]], dtype=float32), 'state': {'position': (2, 0), 'water_timer': 3, 'food_timer': 7, 'beliefs': {'water_replenish_rate': inf, 'food_replenish_rate': inf}}, 'message': ''}


In [59]:
# Pseudocode for a single step in the environment
obs = env.reset()
done = False
while not done:
    # Assume agents can select actions based on observations
    actions = [agent.act(obs[agent_id]) for agent_id, agent in enumerate(agents)]
    next_obs, rewards, done, _ = env.step(actions)
    # Store (obs, actions, rewards, next_obs) in replay buffer
    # Sample a batch from the replay buffer and update both actor and critic models
    obs = next_obs

AttributeError: 'MADDPGAgent' object has no attribute 'act'

In [53]:
# Initialize environment
env = ForagingEnv()

# Initialize agents with their actor and critic networks
agents = [MADDPGAgent(obs_size=env.observation_size, action_size=env.action_size, n_agents=env.n_agents) for _ in range(env.n_agents)]

# Initialize replay buffer
replay_buffer = ReplayBuffer()

for episode in range(num_episodes):
    state = env.reset()
    while not done:
        actions = [agent.act(state[agent_id]) for agent_id, agent in enumerate(agents)]
        next_state, rewards, done, _ = env.step(actions)

        # Store experience in replay buffer
        replay_buffer.add(state, actions, rewards, next_state, done)

        # Sample a batch of experiences from the buffer
        batch = replay_buffer.sample()

        # Update each agent - this involves updating both the actor and critic networks
        for agent_id, agent in enumerate(agents):
            agent.update(batch, agent_id, agents)

        state = next_state

NameError: name 'ReplayBuffer' is not defined