# 1 - DQN

In [3]:
# sugarscape_dqn_training.py

import random
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

# Define the DQN agent class
class DQNAgent(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNAgent, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, action_size)
        
        # Initialize weights
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.zeros_(self.fc3.bias)
    
    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)

class SugarscapeEnvironmentDQN:
    def __init__(self, width=30, height=30, num_agents=400, cell_size=10, seed=42,
                 max_messages=5, max_timesteps=1000, visualize=False):
        self.width = width
        self.height = height
        self.num_agents = num_agents
        self.cell_size = cell_size
        self.seed = seed
        self.visualize = visualize
        self.max_messages = max_messages
        self.max_timesteps = max_timesteps

        # Initialize random number generators with the seed for reproducibility
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)

        self.params = {
            'max_sugar': 5,
            'growth_rate': 1,
            'sugar_peak_frequency': 0.05,
            'sugar_peak_spread': 3,
            'job_center_duration': (20, 50),
            'vision_range': 1,  # Reduced vision range for simplicity
            'message_expiry': 15,
            'max_relay_messages': 10,
            'exploration_probability': 0.1
        }

        self.job_centers = []
        self.sugar = np.zeros((self.height, self.width), dtype=int)
        self.create_initial_sugar_peaks()
        self.agents = self.initialize_agents()
        self.agent_positions = set((agent['x'], agent['y']) for agent in self.agents)
        self.dead_agents = []

        # DQN Components
        self.state_size = 5 + (2 * self.params['vision_range'] + 1) ** 2 + (3 * self.max_messages)
        self.action_size = 5  # Up, Down, Left, Right, Stay
        self.q_network = DQNAgent(self.state_size, self.action_size)
        self.target_network = DQNAgent(self.state_size, self.action_size)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=1e-3)
        self.criterion = nn.MSELoss()
        self.replay_buffer = deque(maxlen=10000)
        self.batch_size = 64
        self.gamma = 0.99
        self.epsilon_start = 1.0
        self.epsilon_end = 0.1
        self.epsilon_decay = 10000
        self.epsilon = self.epsilon_start
        self.episode = 0  # For tracking episodes

    def create_initial_sugar_peaks(self, num_peaks=2):
        for _ in range(num_peaks):
            self.create_job_center()
        self.update_sugar_landscape()

    def create_job_center(self):
        x, y = np.random.randint(0, self.width), np.random.randint(0, self.height)
        duration = np.random.randint(*self.params['job_center_duration'])
        self.job_centers.append({
            'x': x, 'y': y,
            'duration': duration,
            'max_sugar': self.params['max_sugar']
        })

    def update_sugar_landscape(self):
        self.sugar = np.zeros((self.height, self.width))
        for center in self.job_centers:
            x_grid, y_grid = np.meshgrid(np.arange(self.width), np.arange(self.height))
            distance = np.sqrt((x_grid - center['x']) ** 2 + (y_grid - center['y']) ** 2)
            sugar_level = center['max_sugar'] * np.exp(-distance ** 2 / (2 * self.params['sugar_peak_spread'] ** 2))
            self.sugar += sugar_level
        self.sugar = np.clip(self.sugar, 0, self.params['max_sugar'])
        self.sugar = np.round(self.sugar).astype(int)

    def initialize_agents(self):
        agents = []
        available_positions = set((x, y) for x in range(self.width) for y in range(self.height))
        for i in range(self.num_agents):
            if not available_positions:
                break
            x, y = available_positions.pop()
            agents.append(self.create_agent(i, x, y))
        return agents

    def create_agent(self, id, x, y):
        return {
            'id': id,
            'x': x,
            'y': y,
            'sugar': np.random.randint(20, 50),
            'metabolism': np.random.randint(1, 3),
            'vision': self.params['vision_range'],
            'messages': deque(maxlen=self.max_messages),
            'destination': None,
            'memory': deque(maxlen=10),
            'path': [],
            'age': 0
        }

    def get_state(self, agent):
        x, y = agent['x'], agent['y']
        sugar = agent['sugar'] / 100  # Normalize sugar level
        metabolism = agent['metabolism'] / 5  # Normalize metabolism
        vision = agent['vision'] / 5  # Normalize vision

        # Extract sugar levels within vision range
        vision_range = agent['vision']
        y_min = max(0, y - vision_range)
        y_max = min(self.height, y + vision_range + 1)
        x_min = max(0, x - vision_range)
        x_max = min(self.width, x + vision_range + 1)
        sugar_map = self.sugar[y_min:y_max, x_min:x_max]

        # Pad the sugar map to a fixed size
        expected_size = (2 * vision_range + 1, 2 * vision_range + 1)
        padded_sugar_map = np.zeros(expected_size)
        y_offset = y_min - (y - vision_range)
        x_offset = x_min - (x - vision_range)
        y_start = y_offset
        y_end = y_start + sugar_map.shape[0]
        x_start = x_offset
        x_end = x_start + sugar_map.shape[1]
        padded_sugar_map[y_start:y_end, x_start:x_end] = sugar_map
        sugar_map_flat = padded_sugar_map.flatten() / self.params['max_sugar']  # Normalize sugar levels

        # Encode messages
        N = self.max_messages  # Number of messages to encode
        messages = list(agent['messages'])[-N:]  # Get the last N messages
        message_features = []
        for msg in messages:
            # Normalize message coordinates relative to grid size
            msg_x = msg['x'] / self.width
            msg_y = msg['y'] / self.height
            msg_sugar = msg['sugar_amount'] / self.params['max_sugar']
            message_features.extend([msg_x, msg_y, msg_sugar])
        # Pad remaining messages with zeros if fewer than N
        while len(message_features) < 3 * N:
            message_features.extend([0.0, 0.0, 0.0])

        state = np.concatenate((
            [x / self.width, y / self.height, sugar, metabolism, vision],
            sugar_map_flat,
            message_features
        ))
        return state

    def select_action(self, state, valid_actions):
        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        else:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                q_values = self.q_network(state_tensor)
            # Mask invalid actions by setting their Q-values to -inf
            mask = torch.full((self.action_size,), -np.inf)
            mask[valid_actions] = 0
            masked_q = q_values + torch.FloatTensor(mask)
            return masked_q.argmax().item()

    def get_valid_actions(self, agent):
        actions = []
        x, y = agent['x'], agent['y']
        possible_moves = {
            0: (x, y - 1),  # Up
            1: (x, y + 1),  # Down
            2: (x - 1, y),  # Left
            3: (x + 1, y),  # Right
            4: (x, y)       # Stay
        }
        for action, (nx, ny) in possible_moves.items():
            if 0 <= nx < self.width and 0 <= ny < self.height:
                if (nx, ny) not in self.agent_positions or (nx, ny) == (x, y):
                    actions.append(action)
        return actions

    def move_agent(self, agent, action):
        x, y = agent['x'], agent['y']
        possible_moves = {
            0: (x, y - 1),  # Up
            1: (x, y + 1),  # Down
            2: (x - 1, y),  # Left
            3: (x + 1, y),  # Right
            4: (x, y)       # Stay
        }
        nx, ny = possible_moves[action]
        if (0 <= nx < self.width and 0 <= ny < self.height and
                ((nx, ny) not in self.agent_positions or (nx, ny) == (x, y))):
            self.agent_positions.remove((x, y))
            agent['x'], agent['y'] = nx, ny
            agent['path'].append((agent['x'], agent['y']))
            self.agent_positions.add((nx, ny))

    def collect_sugar_and_update_agent(self, agent):
        collected_sugar = self.sugar[agent['y'], agent['x']]
        agent['sugar'] += collected_sugar
        self.sugar[agent['y'], agent['x']] = 0
        agent['sugar'] -= agent['metabolism']
        agent['age'] += 1

    def broadcast_messages(self):
        if not self.agents:
            return  # No agents to broadcast

        positions = np.array([[agent['x'], agent['y']] for agent in self.agents])
        tree = cKDTree(positions)

        for i, agent in enumerate(self.agents):
            # Identify visible sugar peaks
            visible_sugar = self.get_visible_sugar(agent)
            sugar_locations = np.argwhere(visible_sugar > 0)
            messages = []
            for loc in sugar_locations:
                msg_x = agent['x'] + loc[1] - agent['vision']
                msg_y = agent['y'] + loc[0] - agent['vision']
                # Ensure message coordinates are within grid
                msg_x = int(np.clip(msg_x, 0, self.width - 1))
                msg_y = int(np.clip(msg_y, 0, self.height - 1))
                msg = {
                    'sender_id': agent['id'],
                    'timestep': self.timestep,
                    'sugar_amount': self.sugar[msg_y, msg_x],
                    'x': msg_x,
                    'y': msg_y
                }
                messages.append(msg)

            # Broadcast to neighbors within broadcast_radius
            radius = 5  # Fixed broadcast radius
            neighbors = tree.query_ball_point([agent['x'], agent['y']], radius)
            for neighbor_idx in neighbors:
                if neighbor_idx != i:
                    for msg in messages:
                        self.agents[neighbor_idx]['messages'].append(msg)

    def get_visible_sugar(self, agent):
        x, y = agent['x'], agent['y']
        vision = agent['vision']
        y_min = max(0, y - vision)
        y_max = min(self.height, y + vision + 1)
        x_min = max(0, x - vision)
        x_max = min(self.width, x + vision + 1)
        visible_area = self.sugar[y_min:y_max, x_min:x_max]
        return visible_area

    def step(self):
        # Update job centers and sugar landscape
        for center in self.job_centers:
            center['duration'] -= 1
        self.job_centers = [center for center in self.job_centers if center['duration'] > 0]
        if np.random.random() < self.params['sugar_peak_frequency']:
            self.create_job_center()
        self.update_sugar_landscape()

        # Update epsilon
        self.epsilon = max(self.epsilon_end, self.epsilon_start - self.timestep / self.epsilon_decay)

        # Broadcast messages
        self.broadcast_messages()

        # For each agent, select action and collect experience
        for agent in self.agents:
            state = self.get_state(agent)
            valid_actions = self.get_valid_actions(agent)
            if not valid_actions:
                continue  # Skip if no valid actions
            action = self.select_action(state, valid_actions)
            prev_sugar = agent['sugar']
            self.move_agent(agent, action)
            self.collect_sugar_and_update_agent(agent)
            next_state = self.get_state(agent)
            reward = agent['sugar'] - prev_sugar  # Reward is the change in sugar
            done = agent['sugar'] <= 0
            self.replay_buffer.append((state, action, reward, next_state, done))

            if len(self.replay_buffer) >= self.batch_size:
                loss = self.train_dqn()
                # Print progress every 100 episodes
                if (self.episode + 1) % 100 == 0:
                    print(f"Episode: {self.episode + 1}, Timestep: {self.timestep}, Loss: {loss:.4f}, Epsilon: {self.epsilon:.4f}")
                    # Print a summary of weights (e.g., mean and std of first layer)
                    first_layer_weights = self.q_network.fc1.weight.data.numpy()
                    print(f"First Layer Weights - Mean: {first_layer_weights.mean():.4f}, Std: {first_layer_weights.std():.4f}")

        # Handle agent death
        alive_agents = []
        for agent in self.agents:
            if agent['sugar'] <= 0:
                self.dead_agents.append({'x': agent['x'], 'y': agent['y'], 'death_time': self.timestep})
                self.agent_positions.remove((agent['x'], agent['y']))
            else:
                alive_agents.append(agent)
        self.agents = alive_agents

        # Replenish agents
        self.replenish_agents()

        self.dead_agents = [agent for agent in self.dead_agents if self.timestep - agent['death_time'] <= 5]

        self.collect_data()
        self.timestep += 1

        # Save model every 100 episodes
        if (self.episode + 1) % 100 == 0:
            torch.save(self.q_network.state_dict(), f'dqn_q_network_episode_{self.episode + 1}.pth')
            print(f"Model saved at episode {self.episode + 1}")

    def train_dqn(self):
        experiences = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)

        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions).unsqueeze(1)
        rewards = torch.FloatTensor(rewards).unsqueeze(1)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones).unsqueeze(1)

        # Current Q-values
        current_q = self.q_network(states).gather(1, actions)

        # Target Q-values
        with torch.no_grad():
            max_next_q = self.target_network(next_states).max(1)[0].unsqueeze(1)
            target_q = rewards + (self.gamma * max_next_q * (1 - dones))

        # Compute loss
        loss = self.criterion(current_q, target_q)

        # Optimize the Q-network
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def replenish_agents(self):
        while len(self.agents) < self.num_agents:
            x, y = random.randint(0, self.width - 1), random.randint(0, self.height - 1)
            if (x, y) not in self.agent_positions:
                agent_id = max([agent['id'] for agent in self.agents] + [0]) + 1
                new_agent = self.create_agent(agent_id, x, y)
                self.agent_positions.add((x, y))
                self.agents.append(new_agent)

    def collect_data(self):
        population = len(self.agents)
        total_wealth = sum(agent['sugar'] for agent in self.agents)
        average_wealth = total_wealth / population if population > 0 else 0

        self.population_history.append(population)
        self.average_wealth_history.append(average_wealth)
        self.gini_coefficient_history.append(self.calculate_gini_coefficient())

    def calculate_gini_coefficient(self):
        if not self.agents:
            return 0
        wealth_values = sorted(agent['sugar'] for agent in self.agents)
        cumulative_wealth = np.cumsum(wealth_values)
        n = len(wealth_values)
        gini = (n + 1 - 2 * np.sum(cumulative_wealth) / cumulative_wealth[-1]) / n
        return gini

    def run_training(self, total_episodes=1000, max_timesteps=1000):
        for episode in range(total_episodes):
            self.episode = episode
            for _ in range(max_timesteps):
                self.step()
            print(f"Completed Episode: {episode + 1}")

        # Save the final model
        torch.save(self.q_network.state_dict(), 'dqn_q_network_final.pth')
        torch.save(self.target_network.state_dict(), 'dqn_target_network_final.pth')
        np.save('dqn_population.npy', self.population_history)
        np.save('dqn_average_wealth.npy', self.average_wealth_history)
        np.save('dqn_gini_coefficient.npy', self.gini_coefficient_history)

    def plot_results(self):
        plt.figure(figsize=(15, 5))

        plt.subplot(131)
        plt.plot(self.population_history)
        plt.title('Population over Time (DQN)')
        plt.xlabel('Episode')
        plt.ylabel('Population')

        plt.subplot(132)
        plt.plot(self.average_wealth_history)
        plt.title('Average Wealth over Time (DQN)')
        plt.xlabel('Episode')
        plt.ylabel('Average Wealth')

        plt.subplot(133)
        plt.plot(self.gini_coefficient_history)
        plt.title('Gini Coefficient over Time (DQN)')
        plt.xlabel('Episode')
        plt.ylabel('Gini Coefficient')

        plt.tight_layout()
        plt.show()

# Example usage without visualization
if __name__ == "__main__":
    TOTAL_EPISODES = 1000
    MAX_TIMESTEPS = 1000
    env = SugarscapeEnvironmentDQN(
        width=30,
        height=30,
        num_agents=400,
        cell_size=10,
        seed=42,
        max_messages=5,
        max_timesteps=MAX_TIMESTEPS,
        visualize=False  # Disable visualization during training
    )
    env.run_training(total_episodes=TOTAL_EPISODES, max_timesteps=MAX_TIMESTEPS)
    env.plot_results()
    print("\nTraining completed and results saved.")


AttributeError: 'SugarscapeEnvironmentDQN' object has no attribute 'timestep'