### GridWorld Environment

In [25]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from typing import Dict, List, Tuple, Optional


class DeliveryGridWorld(gym.Env):
    """
    A gridworld environment for multiple delivery bots.
    
    Each agent needs to pick up packages and deliver them to destinations.
    """
    
    def __init__(self, grid_size=10, num_agents=4, num_packages=8, max_steps=200):
        super(DeliveryGridWorld, self).__init__()
        
        self.grid_size = grid_size
        self.num_agents = num_agents
        self.num_packages = num_packages
        self.max_steps = max_steps
        self.current_step = 0
        
        # Action space: move up, down, left, right, pickup/deliver
        self.action_space = spaces.Discrete(5)
        
        # Observation space per agent: 
        # - Agent position (x, y)
        # - Agent carrying status
        # - Positions of all packages with delivery status
        # - Positions of all destinations
        # - Positions of other agents
        
        obs_size = 2 + 1 + (3 * num_packages) + (2 * num_packages) + (2 * num_agents - 2)
        self.observation_space = spaces.Box(
            low=0, high=grid_size-1, shape=(obs_size,), dtype=np.float32
        )
        
        # Initialize grid and positions
        self.grid = np.zeros((grid_size, grid_size), dtype=int)
        self.reset()
    
    def reset(self, seed=None, options=None):
        """Reset the environment to initial state."""
        super().reset(seed=seed)
        self.current_step = 0
        
        # Clear grid
        self.grid = np.zeros((self.grid_size, self.grid_size), dtype=int)
        
        # Initialize agent positions (avoid collisions)
        self.agent_positions = []
        self.agent_carrying = [None] * self.num_agents
        
        for _ in range(self.num_agents):
            while True:
                pos = (
                    self.np_random.integers(0, self.grid_size),
                    self.np_random.integers(0, self.grid_size)
                )
                if pos not in self.agent_positions:
                    self.agent_positions.append(pos)
                    break
        
        # Initialize package positions and destinations
        self.packages = []
        self.destinations = []
        self.package_status = [0] * self.num_packages  # 0: waiting, 1: picked, 2: delivered
        
        # Define package-destination mapping - CRITICAL FIX
        self.package_destinations = list(range(self.num_packages))
        
        for _ in range(self.num_packages):
            # Package position
            while True:
                pkg_pos = (
                    self.np_random.integers(0, self.grid_size),
                    self.np_random.integers(0, self.grid_size)
                )
                if (pkg_pos not in self.agent_positions and 
                    pkg_pos not in self.packages):
                    self.packages.append(pkg_pos)
                    break
            
            # Destination position
            while True:
                dest_pos = (
                    self.np_random.integers(0, self.grid_size),
                    self.np_random.integers(0, self.grid_size)
                )
                if (dest_pos not in self.agent_positions and 
                    dest_pos not in self.packages and
                    dest_pos not in self.destinations):
                    self.destinations.append(dest_pos)
                    break
        
        # Print initial state for debugging
        print("Environment Reset")
        print(f"Grid Size: {self.grid_size}x{self.grid_size}")
        print(f"Number of Agents: {self.num_agents}")
        print(f"Number of Packages: {self.num_packages}")
        print(f"Agent Positions: {self.agent_positions}")
        print(f"Package Positions: {self.packages}")
        print(f"Destination Positions: {self.destinations}")
        print(f"Package-Destination Mapping: {self.package_destinations}")
        
        return self._get_observations(), {}
    
    def _get_observations(self):
        """Get observations for all agents."""
        observations = {}
        
        for agent_id in range(self.num_agents):
            obs = self._get_agent_observation(agent_id)
            observations[agent_id] = obs
            
        return observations
    
    def _get_agent_observation(self, agent_id):
        """Get observation for a specific agent."""
        agent_pos = self.agent_positions[agent_id]
        
        # Agent's own position
        obs = [agent_pos[0] / (self.grid_size - 1), agent_pos[1] / (self.grid_size - 1)]
        
        # Carrying status (0 if not carrying, index+1 of package if carrying)
        carrying = 0 if self.agent_carrying[agent_id] is None else self.agent_carrying[agent_id] + 1
        obs.append(carrying / (self.num_packages + 1))
        
        # Package positions and status
        for i in range(self.num_packages):
            if self.package_status[i] == 2:  # Delivered
                # Use (-1, -1) for delivered packages
                obs.extend([-1.0, -1.0, 1.0])
            else:
                package_pos = self.packages[i]
                obs.extend([
                    package_pos[0] / (self.grid_size - 1),
                    package_pos[1] / (self.grid_size - 1),
                    self.package_status[i] / 2.0  # Normalize status
                ])
        
        # Destination positions
        for dest_pos in self.destinations:
            obs.extend([dest_pos[0] / (self.grid_size - 1), dest_pos[1] / (self.grid_size - 1)])
        
        # Other agents' positions
        for i in range(self.num_agents):
            if i != agent_id:
                other_pos = self.agent_positions[i]
                obs.extend([other_pos[0] / (self.grid_size - 1), other_pos[1] / (self.grid_size - 1)])
        
        return np.array(obs, dtype=np.float32)
    
    def step(self, actions):
        """Execute actions for all agents."""
        self.current_step += 1
        
        rewards = {agent_id: 0.0 for agent_id in range(self.num_agents)}
        infos = {agent_id: {} for agent_id in range(self.num_agents)}
        
        # Print current state before actions
        if self.current_step % 10 == 0:
            print(f"\n--- Step {self.current_step} ---")
            print(f"Agent Positions: {self.agent_positions}")
            for i in range(self.num_agents):
                carrying = "None" if self.agent_carrying[i] is None else f"Package {self.agent_carrying[i]}"
                print(f"Agent {i} at {self.agent_positions[i]} carrying: {carrying}")
        
        # Process movements (with collision prevention)
        new_positions = []
        
        for agent_id, action in actions.items():
            # Get current position
            current_pos = self.agent_positions[agent_id]
            new_pos = list(current_pos)
            
            # Movement actions
            if action == 0:  # Up
                new_pos[1] = max(0, current_pos[1] - 1)
            elif action == 1:  # Down
                new_pos[1] = min(self.grid_size - 1, current_pos[1] + 1)
            elif action == 2:  # Left
                new_pos[0] = max(0, current_pos[0] - 1)
            elif action == 3:  # Right
                new_pos[0] = min(self.grid_size - 1, current_pos[0] + 1)
            
            new_positions.append(tuple(new_pos))
            
            if self.current_step % 10 == 0:
                print(f"Agent {agent_id} action: {action} (0=Up, 1=Down, 2=Left, 3=Right, 4=Pick/Deliver)")
                if action < 4:
                    print(f"  Moving from {current_pos} to {new_pos}")
        
        # Check for collisions and update positions
        for agent_id, new_pos in enumerate(new_positions):
            # Only move if no collision with another agent
            collision = False
            for other_id, other_new_pos in enumerate(new_positions):
                if agent_id != other_id and new_pos == other_new_pos:
                    collision = True
                    break
            
            if not collision and new_pos not in self.agent_positions[:agent_id] + self.agent_positions[agent_id+1:]:
                self.agent_positions[agent_id] = new_pos
            else:
                # Small penalty for collision
                rewards[agent_id] -= 0.1
                if self.current_step % 10 == 0:
                    print(f"  Agent {agent_id} collision detected! Staying at {self.agent_positions[agent_id]}")
        
        # Process pickup/deliver actions
        for agent_id, action in actions.items():
            if action == 4:  # Pickup or deliver
                current_pos = self.agent_positions[agent_id]
                
                if self.agent_carrying[agent_id] is None:
                    # Try to pick up a package
                    for pkg_idx, pkg_pos in enumerate(self.packages):
                        if current_pos == pkg_pos and self.package_status[pkg_idx] == 0:
                            self.agent_carrying[agent_id] = pkg_idx
                            self.package_status[pkg_idx] = 1
                            rewards[agent_id] += 1.0  # Reward for picking up
                            print(f"Agent {agent_id} picked up package {pkg_idx} at {current_pos}")


                            break
                else:
                    # Try to deliver a package
                    pkg_idx = self.agent_carrying[agent_id]
                    dest_idx = self.package_destinations[pkg_idx]  # Get the correct destination for this package
                    
                    if current_pos == self.destinations[dest_idx]:
                        self.package_status[pkg_idx] = 2  # Mark as delivered
                        self.agent_carrying[agent_id] = None
                        rewards[agent_id] += 5.0  # Larger reward for delivery
                        print(f"🎉 SUCCESS! Agent {agent_id} delivered package {pkg_idx} to destination {dest_idx} at {current_pos}")
                    else:
                        if self.current_step % 10 == 0:
                            print(f"Agent {agent_id} tried to deliver package {pkg_idx} at {current_pos} but this is not destination {dest_idx} ({self.destinations[dest_idx]})")
        
        # Small penalty for each step to encourage efficiency
        for agent_id in range(self.num_agents):
            rewards[agent_id] -= 0.01
        
        # Check if all packages are delivered
        done = all(status == 2 for status in self.package_status)
        
        # Print delivery status
        if self.current_step % 10 == 0:
            delivered_count = sum(1 for status in self.package_status if status == 2)
            print(f"Packages delivered: {delivered_count}/{self.num_packages}")
            for i, status in enumerate(self.package_status):
                status_text = "Not Picked" if status == 0 else ("Picked Up" if status == 1 else "Delivered")
                print(f"  Package {i}: {status_text}")
        
        # Check if max steps reached
        if self.current_step >= self.max_steps:
            print(f"\nMax steps ({self.max_steps}) reached!")
            done = True
        
        truncated = {agent_id: False for agent_id in range(self.num_agents)}
        dones = {agent_id: done for agent_id in range(self.num_agents)}
        dones["__all__"] = done
        
        return self._get_observations(), rewards, dones, truncated, infos
    
    def render(self, mode='human'):
        """Simple text-based rendering."""
        grid_render = [['.' for _ in range(self.grid_size)] for _ in range(self.grid_size)]
        
        # Mark destinations
        for i, dest_pos in enumerate(self.destinations):
            grid_render[dest_pos[1]][dest_pos[0]] = f'D{i}'
        
        # Mark packages
        for i, pkg_pos in enumerate(self.packages):
            if self.package_status[i] == 0:  # Only show if not picked up
                grid_render[pkg_pos[1]][pkg_pos[0]] = f'P{i}'
        
        # Mark agents
        for i, agent_pos in enumerate(self.agent_positions):
            carrying = ''
            if self.agent_carrying[i] is not None:
                carrying = f'({self.agent_carrying[i]})'
            grid_render[agent_pos[1]][agent_pos[0]] = f'A{i}{carrying}'
        
        # Print grid
        for row in grid_render:
            print(' '.join(row))
        print("\n")
    
    def get_sumo_data(self):
        """
        Return data that can be used for SUMO visualization.
        """
        sumo_data = {
            'grid_size': self.grid_size,
            'agents': [],
            'packages': [],
            'destinations': []
        }
        
        # Agent data
        for i, pos in enumerate(self.agent_positions):
            agent_data = {
                'id': i,
                'x': pos[0],
                'y': pos[1],
                'carrying': self.agent_carrying[i]
            }
            sumo_data['agents'].append(agent_data)
        
        # Package data
        for i in range(self.num_packages):
            if self.package_status[i] < 2:  # Only include non-delivered packages
                pkg_data = {
                    'id': i,
                    'x': self.packages[i][0],
                    'y': self.packages[i][1],
                    'status': self.package_status[i]
                }
                sumo_data['packages'].append(pkg_data)
        
        # Destination data
        for i, pos in enumerate(self.destinations):
            dest_data = {
                'id': i,
                'x': pos[0],
                'y': pos[1]
            }
            sumo_data['destinations'].append(dest_data)
        
        return sumo_data

## PPO Training

In [8]:
import os
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ActorNetwork(nn.Module):
    def __init__(self, input_dims, n_actions, alpha=0.0003):
        super(ActorNetwork, self).__init__()
        
        self.actor = nn.Sequential(
            nn.Linear(input_dims, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions),
            nn.Softmax(dim=-1)
        )
        
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.to(device)
    
    def forward(self, state):
        dist = self.actor(state)
        dist = Categorical(dist)
        
        return dist

class CriticNetwork(nn.Module):
    def __init__(self, input_dims, alpha=0.0003):
        super(CriticNetwork, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(input_dims, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
        
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.to(device)
    
    def forward(self, state):
        value = self.critic(state)
        
        return value

class Agent:
    def __init__(self, input_dims, n_actions, gamma=0.99, alpha=0.0003, 
                 gae_lambda=0.95, policy_clip=0.2, batch_size=64, 
                 n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        
        self.actor = ActorNetwork(input_dims, n_actions, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)
    
    def remember(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)
    
    def save_models(self, path):
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
    
    # def load_models(self, path):
    #     self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
    #     self.critic.load_state_dict(torch.load(os.path.join(path, 'critic.pth')))

    def load_models(self, path):
        actor_state_dict = torch.load(os.path.join(path, 'actor.pth'))
        critic_state_dict = torch.load(os.path.join(path, 'critic.pth'))
        
        # Get the input dimension from the saved model
        input_dims = actor_state_dict['actor.0.weight'].shape[1]
        
        # Recreate the networks with the correct input size
        self.actor = ActorNetwork(input_dims, self.actor.actor[-2].out_features)
        self.critic = CriticNetwork(input_dims)
        
        # Load the state dictionaries
        self.actor.load_state_dict(actor_state_dict)
        self.critic.load_state_dict(critic_state_dict)
    
    def choose_action(self, observation):
        state = torch.tensor([observation], dtype=torch.float).to(device)
        
        dist = self.actor(state)
        value = self.critic(state)
        action = dist.sample()
        
        probs = torch.squeeze(dist.log_prob(action)).item()
        action = torch.squeeze(action).item()
        value = torch.squeeze(value).item()
        
        return action, probs, value
    
    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr, \
            reward_arr, dones_arr, batches = self.memory.generate_batches()
            
            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)
            
            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    a_t += discount * (reward_arr[k] + self.gamma * values[k+1] * (1-dones_arr[k]) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t
            
            advantage = torch.tensor(advantage).to(device)
            values = torch.tensor(values).to(device)
            
            for batch in batches:
                states = torch.tensor(state_arr[batch], dtype=torch.float).to(device)
                old_probs = torch.tensor(old_prob_arr[batch]).to(device)
                actions = torch.tensor(action_arr[batch]).to(device)
                
                dist = self.actor(states)
                critic_value = self.critic(states)
                
                critic_value = torch.squeeze(critic_value)
                
                new_probs = dist.log_prob(actions)
                prob_ratio = new_probs.exp() / old_probs.exp()
                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = torch.clamp(prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * advantage[batch]
                
                actor_loss = -torch.min(weighted_probs, weighted_clipped_probs).mean()
                
                returns = advantage[batch] + values[batch]
                critic_loss = (returns - critic_value)**2
                critic_loss = critic_loss.mean()
                
                total_loss = actor_loss + 0.5 * critic_loss
                
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()
        
        self.memory.clear_memory()





### PPO memory

In [7]:
import os
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.actions = []
        self.probs = []
        self.vals = []
        self.rewards = []
        self.dones = []
        self.batch_size = batch_size
    
    def store_memory(self, state, action, prob, val, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.vals.append(val)
        self.rewards.append(reward)
        self.dones.append(done)
    
    def clear_memory(self):
        self.states = []
        self.actions = []
        self.probs = []
        self.vals = []
        self.rewards = []
        self.dones = []
    
    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]
        
        return np.array(self.states), np.array(self.actions), \
               np.array(self.probs), np.array(self.vals), \
               np.array(self.rewards), np.array(self.dones), batches


### Training the agents

In [10]:
def train_agents(env, agents, n_episodes=1000, save_interval=100, checkpoint_dir='checkpoints'):
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    best_reward = -np.inf
    score_history = []
    
    for episode in range(n_episodes):
        obs, _ = env.reset()
        done = False
        score = 0
        
        while not done:
            actions = {}
            for agent_id, agent in enumerate(agents):
                # Get observation for the specific agent
                observation = obs[agent_id]
                action, prob, val = agent.choose_action(observation)
                actions[agent_id] = action
                
            next_obs, rewards, dones, truncated, _ = env.step(actions)
            
            # Store transitions in agents' memories
            for agent_id, agent in enumerate(agents):
                agent.remember(obs[agent_id], actions[agent_id], prob, val, rewards[agent_id], dones[agent_id])
                score += rewards[agent_id]
            
            # Update observations
            obs = next_obs
            done = dones["__all__"]
        
        # Learning step for all agents after episode completion
        for agent in agents:
            agent.learn()
        
        score_history.append(score)
        avg_score = np.mean(score_history[-100:]) if len(score_history) >= 100 else np.mean(score_history)
        
        if episode % 10 == 0:
            print(f'Episode: {episode}, Score: {score:.1f}, Avg Score: {avg_score:.1f}')
        
        # Save models if performance improves
        if avg_score > best_reward and episode > 100:
            best_reward = avg_score
            for agent_id, agent in enumerate(agents):
                agent.save_models(os.path.join(checkpoint_dir, f'agent_{agent_id}'))
        
        # Regular checkpoint saving
        if episode % save_interval == 0 and episode > 0:
            checkpoint_episode_dir = os.path.join(checkpoint_dir, f'checkpoint_{episode}')
            os.makedirs(checkpoint_episode_dir, exist_ok=True)
            for agent_id, agent in enumerate(agents):
                agent.save_models(os.path.join(checkpoint_episode_dir, f'agent_{agent_id}'))




if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train multi-agent delivery bots')
    parser.add_argument('--grid_size', type=int, default=10, help='Size of the grid world')
    parser.add_argument('--num_agents', type=int, default=4, help='Number of agents')
    parser.add_argument('--num_packages', type=int, default=8, help='Number of packages')
    parser.add_argument('--episodes', type=int, default=2000, help='Number of training episodes')
    parser.add_argument('--save_interval', type=int, default=200, help='Episodes between checkpoints')
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    # Create environment
    env = DeliveryGridWorld(
        grid_size=args.grid_size,
        num_agents=args.num_agents,
        num_packages=args.num_packages
    )
    
    # Create a temporary observation to get the actual observation size
    temp_obs, _ = env.reset()
    obs_dim = temp_obs[0].shape[0]  # Get the size of one agent's observation
    
    # Create agents with the correct input dimensions
    agents = []
    for _ in range(args.num_agents):
        agent = Agent(
            input_dims=obs_dim,
            n_actions=env.action_space.n,
            batch_size=64,
            alpha=0.0003,
            gamma=0.99
        )
        agents.append(agent)
    

    args.episodes = 3500
    # Train agents
    train_agents(
        env=env,
        agents=agents,
        n_episodes=args.episodes,
        save_interval=args.save_interval
    )



Episode: 0, Score: 0.4, Avg Score: 0.4
Episode: 10, Score: -6.7, Avg Score: -4.7
Episode: 20, Score: -7.5, Avg Score: -5.3
Episode: 30, Score: -2.0, Avg Score: -4.4
Episode: 40, Score: -1.9, Avg Score: -4.5
Episode: 50, Score: -10.4, Avg Score: -4.3
Episode: 60, Score: -2.0, Avg Score: -4.5
Episode: 70, Score: -3.6, Avg Score: -4.6
Episode: 80, Score: -9.3, Avg Score: -4.8
Episode: 90, Score: -5.9, Avg Score: -4.9
Episode: 100, Score: -6.7, Avg Score: -5.1
Episode: 110, Score: -5.5, Avg Score: -5.4
Episode: 120, Score: -5.2, Avg Score: -5.4
Episode: 130, Score: -4.2, Avg Score: -5.5
Episode: 140, Score: -4.4, Avg Score: -5.5
Episode: 150, Score: -6.9, Avg Score: -5.7
Episode: 160, Score: -1.5, Avg Score: -5.7
Episode: 170, Score: -0.5, Avg Score: -5.6
Episode: 180, Score: -7.0, Avg Score: -5.6
Episode: 190, Score: -6.9, Avg Score: -5.3
Episode: 200, Score: -6.1, Avg Score: -5.4
Episode: 210, Score: -6.6, Avg Score: -5.1
Episode: 220, Score: -4.4, Avg Score: -4.8
Episode: 230, Score: -5

## Preparing Config Files for Sumo Visualization

In [None]:
import os
import argparse
import numpy as np
import torch
import traci
import traci.constants as tc
import subprocess
import tempfile
import time
# from delivery_environment import DeliveryGridWorld

def generate_sumo_config(grid_size, temp_dir):
    """Generate SUMO configuration files for visualization."""
    
    # Scale factor to make the grid visualization better in SUMO
    scale = 1 # enlarges 10 times
    
    # Create nodes file
    nodes_file = os.path.join(temp_dir, "delivery_grid.nod.xml")
    with open(nodes_file, "w") as f:
        f.write('<nodes>\n')
        # Create nodes for each intersection in the grid
        for y in range(grid_size + 1):
            for x in range(grid_size + 1):
                f.write(f'    <node id="n{x}_{y}" x="{x * scale}" y="{y * scale}" type="priority"/>\n')
        f.write('</nodes>\n')
    
    # Create edges file
    edges_file = os.path.join(temp_dir, "delivery_grid.edg.xml")
    with open(edges_file, "w") as f:
        f.write('<edges>\n')
        # Create horizontal edges
        for y in range(grid_size + 1):
            for x in range(grid_size):
                f.write(f'    <edge id="h{x}_{y}" from="n{x}_{y}" to="n{x+1}_{y}" numLanes="1" speed="13.89"/>\n')
                f.write(f'    <edge id="h{x}_{y}_back" from="n{x+1}_{y}" to="n{x}_{y}" numLanes="1" speed="13.89"/>\n')
        
        # Create vertical edges
        for x in range(grid_size + 1):
            for y in range(grid_size):
                f.write(f'    <edge id="v{x}_{y}" from="n{x}_{y}" to="n{x}_{y+1}" numLanes="1" speed="13.89"/>\n')
                f.write(f'    <edge id="v{x}_{y}_back" from="n{x}_{y+1}" to="n{x}_{y}" numLanes="1" speed="13.89"/>\n')
        f.write('</edges>\n')
    
    # Create netconvert command
    netconvert_cmd = [
        "netconvert",
        "--node-files", nodes_file,
        "--edge-files", edges_file,
        "--output-file", os.path.join(temp_dir, "delivery_grid.net.xml"),
        "--no-turnarounds", "true"
    ]
    
    # Run netconvert
    subprocess.run(netconvert_cmd)
    
    # Create routes file
    routes_file = os.path.join(temp_dir, "delivery_grid.rou.xml")
    with open(routes_file, "w") as f:
        f.write('<routes>\n')
        # Define vehicle types - Fix: Use valid SUMO shapes
        f.write('    <vType id="agent" length="2" minGap="1" maxSpeed="5" guiShape="passenger" color="0,0,255"/>\n')
        f.write('    <vType id="package" length="1" minGap="1" maxSpeed="0.1" guiShape="delivery" color="255,0,0"/>\n')
        f.write('    <vType id="destination" length="1" minGap="1" maxSpeed="0.1" guiShape="truck" color="0,255,0"/>\n')
        
        # Create a route that covers the entire grid
        f.write('    <route id="grid_route" edges="')
        for y in range(grid_size + 1):
            for x in range(grid_size):
                f.write(f"h{x}_{y} ")
            if y < grid_size:
                f.write(f"v{grid_size}_{y} ")
                for x in range(grid_size-1, -1, -1):
                    f.write(f"h{x}_{y+1}_back ")
                if y < grid_size-1:
                    f.write(f"v0_{y+1}_back ")
        f.write('"/>\n')
        f.write('</routes>\n')
    
    # Create SUMO configuration file
    sumo_config = os.path.join(temp_dir, "delivery_grid.sumocfg")
    with open(sumo_config, "w") as f:
        f.write('<configuration>\n')
        f.write('    <input>\n')
        f.write(f'        <net-file value="delivery_grid.net.xml"/>\n')
        f.write(f'        <route-files value="delivery_grid.rou.xml"/>\n')
        f.write('    </input>\n')
        f.write('    <time>\n')
        f.write('        <begin value="0"/>\n')
        f.write('        <end value="1000"/>\n')
        f.write('    </time>\n')
        f.write('</configuration>\n')
    
    return sumo_config

def get_nearest_edge(x, y, grid_size, scale=100):
    """Get the nearest edge ID for a given position."""
    # Find nearest node coordinates
    node_x = round(x / scale) * scale
    node_y = round(y / scale) * scale
    
    # Convert to node indices
    nx = int(node_x / scale)
    ny = int(node_y / scale)
    
    # Ensure we're within bounds
    nx = max(0, min(nx, grid_size))
    ny = max(0, min(ny, grid_size))
    
    # Determine which edge to use based on proximity to the nearest node
    if abs(x - node_x) > abs(y - node_y):
        # Closer to horizontal edge
        if x > node_x and nx < grid_size:
            return f"h{nx}_{ny}"
        elif nx > 0:
            return f"h{nx-1}_{ny}_back"
    else:
        # Closer to vertical edge
        if y > node_y and ny < grid_size:
            return f"v{nx}_{ny}"
        elif ny > 0:
            return f"v{nx}_{ny-1}_back"
    
    # Fallback to a default edge
    if grid_size > 0:
        return f"h0_0"
    return ""



def update_sumo_visualization(env):
    """Create a completely new visualization from scratch each time."""
    # Remove all existing vehicles first
    for veh_id in traci.vehicle.getIDList():
        traci.vehicle.remove(veh_id)
    
    scale = 100  # Scale factor for visualization
    
    # Add agents as vehicles with distinctive appearance
    for i, pos in enumerate(env.agent_positions):
        agent_id = f"agent_{i}"
        x, y = pos
        sumo_x = x * scale + scale/2
        sumo_y = y * scale + scale/2
        
        # Add vehicle
        traci.vehicle.add(agent_id, "grid_route", typeID="agent")
        traci.vehicle.moveToXY(agent_id, "", 0, sumo_x, sumo_y, angle=0, keepRoute=2)
        
        # Make it more visible
        traci.vehicle.setColor(agent_id, (0, 0, 255, 255))  # Blue
        traci.vehicle.setWidth(agent_id, 10.0)  # Much wider
        traci.vehicle.setLength(agent_id, 16.0)  # Much longer
        
        # Change color if carrying a package
        if env.agent_carrying[i] is not None:
            traci.vehicle.setColor(agent_id, (255, 0, 255, 255))  # Purple when carrying
    
    # Add packages as polygons
    for i, pos in enumerate(env.packages):
        pkg_id = f"package_{i}"
        
        # Only show if not picked up or delivered
        if env.package_status[i] == 0:  # Not picked up yet
            x, y = pos
            sumo_x = x * scale + scale/2
            sumo_y = y * scale + scale/2
            
            # Create a square polygon
            size = 10  # Size of the square
            shape = [(sumo_x-size, sumo_y-size), 
                     (sumo_x+size, sumo_y-size),
                     (sumo_x+size, sumo_y+size),
                     (sumo_x-size, sumo_y+size)]
            
            # Remove if already exists
            if pkg_id in traci.polygon.getIDList():
                traci.polygon.remove(pkg_id)
                
            # Add polygon
            traci.polygon.add(pkg_id, shape, (255, 0, 0, 255), True, "")
    
    # Add destinations as polygons
    for i, pos in enumerate(env.destinations):
        dest_id = f"dest_{i}"
        x, y = pos
        sumo_x = x * scale + scale/2
        sumo_y = y * scale + scale/2
        
        # Create a square polygon
        size = 15  # Size of the square
        shape = [(sumo_x-size, sumo_y-size), 
                 (sumo_x+size, sumo_y-size),
                 (sumo_x+size, sumo_y+size),
                 (sumo_x-size, sumo_y+size)]
        
        # Remove if already exists
        if dest_id in traci.polygon.getIDList():
            traci.polygon.remove(dest_id)
            
        # Add polygon
        traci.polygon.add(dest_id, shape, (0, 255, 0, 255), True, "")

def generate_sumo_config(grid_size, temp_dir):
    """Generate SUMO configuration files for visualization."""
    
    # Scale factor to make the grid visualization better in SUMO
    scale = 100
    
    # Create nodes file
    nodes_file = os.path.join(temp_dir, "delivery_grid.nod.xml")
    with open(nodes_file, "w") as f:
        f.write('<nodes>\n')
        # Create nodes for each intersection in the grid
        for y in range(grid_size + 1):
            for x in range(grid_size + 1):
                # Make nodes visible
                f.write(f'    <node id="n{x}_{y}" x="{x * scale}" y="{y * scale}" type="priority"/>\n')
        f.write('</nodes>\n')
    
    # Create edges file
    edges_file = os.path.join(temp_dir, "delivery_grid.edg.xml")
    with open(edges_file, "w") as f:
        f.write('<edges>\n')
        # Create horizontal edges - just make a simple grid
        for y in range(grid_size + 1):
            for x in range(grid_size):
                f.write(f'    <edge id="h{x}_{y}" from="n{x}_{y}" to="n{x+1}_{y}" numLanes="1" speed="13.89"/>\n')
        
        # Create vertical edges
        for x in range(grid_size + 1):
            for y in range(grid_size):
                f.write(f'    <edge id="v{x}_{y}" from="n{x}_{y}" to="n{x}_{y+1}" numLanes="1" speed="13.89"/>\n')
        f.write('</edges>\n')
    
    # Create netconvert command - make sure it runs correctly
    netconvert_cmd = [
        "netconvert",
        "--node-files", nodes_file,
        "--edge-files", edges_file,
        "--output-file", os.path.join(temp_dir, "delivery_grid.net.xml"),
        "--no-turnarounds", "true"
    ]
    
    # Run netconvert
    try:
        subprocess.run(netconvert_cmd, check=True)
        print("SUMO network created successfully")
    except subprocess.CalledProcessError as e:
        print(f"Error creating SUMO network: {e}")
        return None
    
    # Create routes file with a simple route
    routes_file = os.path.join(temp_dir, "delivery_grid.rou.xml")
    with open(routes_file, "w") as f:
        f.write('<routes>\n')
        # Define vehicle types
        f.write('    <vType id="agent" length="2" minGap="1" maxSpeed="5" guiShape="passenger"/>\n')
        
        # Create a very simple route - just one edge is enough since we'll use moveToXY
        f.write('    <route id="grid_route" edges="h0_0"/>\n')
        f.write('</routes>\n')
    
    # Create SUMO configuration file
    sumo_config = os.path.join(temp_dir, "delivery_grid.sumocfg")
    with open(sumo_config, "w") as f:
        f.write('<configuration>\n')
        f.write('    <input>\n')
        f.write(f'        <net-file value="delivery_grid.net.xml"/>\n')
        f.write(f'        <route-files value="delivery_grid.rou.xml"/>\n')
        f.write('    </input>\n')
        f.write('    <time>\n')
        f.write('        <begin value="0"/>\n')
        f.write('        <end value="1000"/>\n')
        f.write('    </time>\n')
        f.write('    <gui_only>\n')
        f.write('        <gui-settings-file value="gui-settings.xml"/>\n')
        f.write('    </gui_only>\n')
        f.write('</configuration>\n')
    
    # Create GUI settings file to improve visibility
    gui_settings = os.path.join(temp_dir, "gui-settings.xml")
    with open(gui_settings, "w") as f:
        f.write('<viewsettings>\n')
        f.write('    <scheme name="real world"/>\n')
        f.write('    <delay value="50"/>\n')
        f.write('    <viewport zoom="100" x="500" y="500"/>\n')
        f.write('</viewsettings>\n')
    
    return sumo_config

def run_visualization(model_path, num_agents=4, grid_size=10, num_packages=8, steps=1000, delay=0.5):
    scale=100
    """Run a visualization of the trained agents."""
    # Create and use a fixed directory for SUMO files
    # temp_dir = "E:/Multi_agent_system DL project/sumo_training"
    temp_dir = "E:\Multi-Agent-Bot-Delivery-System"
    os.makedirs(temp_dir, exist_ok=True)
    
    # Generate SUMO config
    sumo_config = generate_sumo_config(grid_size, temp_dir)
    if sumo_config is None:
        print("Failed to create SUMO configuration.")
        return
    
    # Start SUMO with GUI
    try:
        # Make sure SUMO is in PATH or provide full path
        sumo_binary = "sumo-gui"
        sumo_cmd = [
            sumo_binary, 
            "-c", sumo_config,
            "--start",  # Start the simulation immediately
            "--quit-on-end"  # Quit when simulation ends
        ]
        
        # Start SUMO
        print("Starting SUMO...")
        traci.start(sumo_cmd)
        print("SUMO started successfully")
        
        # Set up environment
        env = DeliveryGridWorld(
            grid_size=grid_size,
            num_agents=num_agents,
            num_packages=num_packages
        )
        
        # Load agent models
        obs_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n
        

        # Add this loop to ensure all packages are correctly visualized:
        for pkg_idx, status in enumerate(env.package_status):
            if status != 0:  # If picked up or delivered
                pkg_id = f"package_{pkg_idx}"
                if pkg_id in traci.polygon.getIDList():
                    traci.polygon.remove(pkg_id)
                    print(f"Fixed: Removed package {pkg_idx} from visualization (status={status})")

        agents = []
        for i in range(num_agents):
            agent = Agent(input_dims=obs_dim, n_actions=n_actions)
            agent.load_models(os.path.join(model_path, f'agent_{i}'))
            agents.append(agent)
        
        # Reset environment
        obs, _ = env.reset()
        done = False
        step_count = 0
        
        # Initial visualization setup
        print("Setting up visualization...")
        
        # Set zoom level
        traci.gui.setZoom("View #0", 100)
        
        # Initial rendering
        update_sumo_visualization(env)
        traci.simulationStep()
        
        # Wait a moment for the GUI to initialize
        time.sleep(1)
        
        print("Starting simulation run...")
        
        while not done and step_count < steps:
            if step_count % 10 == 0:
                print(f"\nStep {step_count}/{steps}")
            
            # Get actions from agents
            actions = {}
            for agent_id, agent in enumerate(agents):
                action, _, _ = agent.choose_action(obs[agent_id])
                actions[agent_id] = action
                print(f"Agent {agent_id} action: {action}")
            
            # Step environment
            next_obs, rewards, dones, _, _ = env.step(actions)
            
            # Print current state
            # for i, pos in enumerate(env.agent_positions):
            #     carrying = env.agent_carrying[i]
            #     carry_str = f"carrying pkg {carrying}" if carrying is not None else "not carrying"
            #     print(f"Agent {i}: pos={pos}, {carry_str}")
            

                # Add packages as polygons
            for i, pos in enumerate(env.packages):
                pkg_id = f"package_{i}"
                
                # Only show if not picked up or delivered
                if env.package_status[i] == 0:  # Not picked up yet
                    x, y = pos
                    sumo_x = x * scale + scale/2
                    sumo_y = y * scale + scale/2
                    
                    # Create a square polygon
                    size = 10  # Size of the square
                    shape = [(sumo_x-size, sumo_y-size), 
                            (sumo_x+size, sumo_y-size),
                            (sumo_x+size, sumo_y+size),
                            (sumo_x-size, sumo_y+size)]
                    
                    # Remove if already exists
                    if pkg_id in traci.polygon.getIDList():
                        traci.polygon.remove(pkg_id)
                        
                    # Add polygon
                    traci.polygon.add(pkg_id, shape, (255, 0, 0, 255), True, "")
                else:
                    # Package is either picked up or delivered - remove from visualization if exists
                    if pkg_id in traci.polygon.getIDList():
                        traci.polygon.remove(pkg_id)


            # Update SUMO visualization from scratch
            update_sumo_visualization(env)
            
            # Update simulation
            traci.simulationStep()
            
            # Update observations
            obs = next_obs
            done = dones["__all__"]
            step_count += 1
            
            # Add delay to make visualization viewable
            time.sleep(delay)
        
        print(f"Visualization complete after {step_count} steps")
        
    except Exception as e:
        print(f"Error during visualization: {e}")
    finally:
        # Close SUMO connection
        if traci.isLoaded():
            traci.close()
            print("SUMO connection closed")


## Visualization of agents in sumo

In [24]:
# Main function to run the visualization
if __name__ == "__main__":
    model_path="E:/Multi_agent_system DL project/sumo_training/checkpoints/checkpoint_400"

    parser = argparse.ArgumentParser(description='Visualize trained delivery agents in SUMO')
    parser.add_argument('--model_path', type=str, help='Path to trained models')
    parser.add_argument('--grid_size', type=int, default=10, help='Size of the grid world')
    parser.add_argument('--num_agents', type=int, default=4, help='Number of agents')
    parser.add_argument('--num_packages', type=int, default=8, help='Number of packages')
    parser.add_argument('--steps', type=int, default=500, help='Maximum steps to run')
    parser.add_argument('--delay', type=float, default=0.5, help='Delay between steps (seconds)')
    args, unknown = parser.parse_known_args()

    # If not provided externally, set it manually
    if args.model_path is None:
        args.model_path = model_path
    
    # args.num_packages = args.num_agents * 2  # Ensure enough packages for agents
    
    run_visualization(
        model_path=args.model_path,
        grid_size=args.grid_size,
        num_agents=args.num_agents,
        
        num_packages=args.num_packages,
        steps=args.steps,
        delay=args.delay
    )


SUMO network created successfully
Starting SUMO...
SUMO started successfully
Environment Reset
Grid Size: 10x10
Number of Agents: 4
Number of Packages: 8
Agent Positions: [(np.int64(5), np.int64(0)), (np.int64(3), np.int64(5)), (np.int64(3), np.int64(9)), (np.int64(8), np.int64(5))]
Package Positions: [(np.int64(4), np.int64(1)), (np.int64(7), np.int64(3)), (np.int64(2), np.int64(5)), (np.int64(1), np.int64(8)), (np.int64(1), np.int64(1)), (np.int64(0), np.int64(9)), (np.int64(8), np.int64(3)), (np.int64(4), np.int64(2))]
Destination Positions: [(np.int64(1), np.int64(3)), (np.int64(5), np.int64(5)), (np.int64(2), np.int64(1)), (np.int64(2), np.int64(2)), (np.int64(4), np.int64(0)), (np.int64(9), np.int64(2)), (np.int64(1), np.int64(9)), (np.int64(3), np.int64(8))]
Package-Destination Mapping: [0, 1, 2, 3, 4, 5, 6, 7]
Environment Reset
Grid Size: 10x10
Number of Agents: 4
Number of Packages: 8
Agent Positions: [(np.int64(3), np.int64(8)), (np.int64(8), np.int64(9)), (np.int64(6), np.in