In [2]:
# Install system dependencies
!apt-get update && apt-get install -y \
    python3-dev \
    swig \
    python3-pygame \
    libsdl2-dev \
    libjpeg-dev \
    zlib1g-dev

# Install Python packages
!pip install "pettingzoo[atari]==1.24.3" gymnasium[atari] numpy pygame

# Install and setup AutoROM
!pip install autorom
!AutoROM --accept-license

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,651 kB]
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,654 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 P

In [18]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import random
from pettingzoo.atari import pong_v3
import gymnasium as gym
import torchvision.transforms as T

# Define experience tuple structure
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class QNetwork(nn.Module):
    """Deep Q-Network architecture with CNN for processing Pong images"""
    def __init__(self, action_size, device="cpu"):
        super(QNetwork, self).__init__()

        # CNN layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Calculate the size of flattened features
        self.fc_input_dim = self._get_conv_output_dim()

        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Linear(self.fc_input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, action_size)
        )

        self.device = device
        self.to(device)

    def _get_conv_output_dim(self):
        # Helper function to calculate conv output dimensions
        x = torch.randn(1, 1, 210, 160)
        x = self.conv_layers(x)
        return int(np.prod(x.shape))

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        return self.fc_layers(x)

class PongPreprocessor:
    """Preprocessor for Pong observations"""
    def __init__(self, device="cpu"):
        self.device = device
        self.transform = T.Compose([
            T.Grayscale(),
            T.Lambda(lambda x: x.to(device))
        ])

    def process(self, observation):
        if observation is None:
            # Return zero tensor if observation is None
            return torch.zeros((1, 1, 210, 160), device=self.device)

        # Convert numpy array to PIL Image if it's not already a tensor
        if isinstance(observation, np.ndarray):
            # Ensure the observation is in the correct format (H, W, C)
            if observation.ndim == 2:
                observation = np.expand_dims(observation, axis=-1)

            # Convert to float and normalize if needed
            if observation.dtype == np.uint8:
                observation = observation.astype(np.float32) / 255.0

            # Convert to tensor with correct shape (C, H, W)
            observation = torch.from_numpy(observation).permute(2, 0, 1)

        # Add batch dimension if not present
        if observation.dim() == 3:
            observation = observation.unsqueeze(0)

        # Apply remaining transforms
        observation = self.transform(observation)
        return observation

class ReplayBuffer:
    """Experience replay buffer for storing past experiences"""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        experience = Experience(state, action, reward, next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        experiences = random.sample(self.buffer, batch_size)

        states = torch.cat([e.state for e in experiences])
        actions = torch.LongTensor([e.action for e in experiences])
        rewards = torch.FloatTensor([e.reward for e in experiences])
        next_states = torch.cat([e.next_state for e in experiences])
        dones = torch.FloatTensor([e.done for e in experiences])

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

class PongAgent:
    """Agent for training and playing Pong"""
    def __init__(self, action_size, device="cpu", learning_rate=1e-4):
        self.action_size = action_size
        self.device = device

        # Q-Networks
        self.q_network = QNetwork(action_size, device)
        self.target_network = QNetwork(action_size, device)
        self.target_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.memory = ReplayBuffer(capacity=100000)
        self.preprocessor = PongPreprocessor(device)

        # Training parameters
        self.batch_size = 32
        self.gamma = 0.99
        self.tau = 1e-3
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

    def choose_action(self, observation, training=True):
        """Select action using epsilon-greedy policy"""
        if training and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            state = self.preprocessor.process(observation)
            action_values = self.q_network(state)
            return action_values.argmax().item()

    def learn(self):
        """Update Q-Network weights"""
        if len(self.memory) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)

        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_network(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute loss and perform optimization
        loss = nn.MSELoss()(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft update target network
        self._update_target_network()

        # Update epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def _update_target_network(self):
        """Soft update of target network"""
        for target_param, local_param in zip(self.target_network.parameters(), self.q_network.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def save(self, path):
        """Save model weights"""
        torch.save({
            'q_network_state_dict': self.q_network.state_dict(),
            'target_network_state_dict': self.target_network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, path)

    def load(self, path):
        """Load model weights"""
        if os.path.exists(path):
            checkpoint = torch.load(path)
            self.q_network.load_state_dict(checkpoint['q_network_state_dict'])
            self.target_network.load_state_dict(checkpoint['target_network_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            self.epsilon = checkpoint['epsilon']

def train_pong_agent(episodes=1000, max_steps=1000, device="cpu"):
    """Train the Pong agent"""
    env = pong_v3.env()
    env.reset()

    # Initialize the environment and get the first agent
    agents = env.possible_agents
    if not agents:
        raise ValueError("No agents found in environment")

    first_agent = agents[0]
    action_size = env.action_space(first_agent).n

    # Initialize agent
    agent = PongAgent(action_size, device)
    scores = []

    for episode in range(episodes):
        env.reset()
        episode_reward = 0
        step_count = 0

        # Main training loop
        for agent_name in env.agent_iter():
            if step_count >= max_steps:
                break

            observation, reward, termination, truncation, info = env.last()
            done = termination or truncation

            # Process current state and choose action
            action = agent.choose_action(observation)
            env.step(action)

            # Get next state
            next_observation, next_reward, next_termination, next_truncation, next_info = env.last()

            # Process states and store in replay buffer
            state = agent.preprocessor.process(observation)
            next_state = agent.preprocessor.process(next_observation)

            # Store experience in replay memory
            agent.memory.add(state, action, reward, next_state, done)

            # Learn from experiences
            agent.learn()

            episode_reward += reward
            step_count += 1

            if done:
                break

        scores.append(episode_reward)

        # Print progress
        if (episode + 1) % 10 == 0:
            avg_score = np.mean(scores[-10:])
            print(f'Episode {episode+1} Average Score: {avg_score:.2f} Epsilon: {agent.epsilon:.2f}')

            # Save model periodically
            if (episode + 1) % 100 == 0:
                agent.save(f'pong_agent_episode_{episode+1}.pth')

    env.close()
    return agent, scores

In [19]:
    trained_agent, training_scores = train_pong_agent(10, 10)

    # Save the final model
    trained_agent.save('pong_agent_final.pth')

Episode 10 Average Score: 0.00 Epsilon: 0.71


In [8]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import random
from pettingzoo.atari import pong_v3
import gymnasium as gym

# Define experience tuple structure
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class QNetwork(nn.Module):
    """Deep Q-Network architecture"""
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, action_size)
        )

    def forward(self, x):
        return self.network(x)

class ReplayBuffer:
    """Experience replay buffer for storing past experiences"""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        experience = Experience(state, action, reward, next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        experiences = random.sample(self.buffer, batch_size)

        states = torch.FloatTensor([e.state for e in experiences])
        actions = torch.LongTensor([e.action for e in experiences])
        rewards = torch.FloatTensor([e.reward for e in experiences])
        next_states = torch.FloatTensor([e.next_state for e in experiences])
        dones = torch.FloatTensor([e.done for e in experiences])

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

class PongAgent:
    """Agent for training and playing Pong"""
    def __init__(self, state_size, action_size, learning_rate=1e-4):
        self.state_size = state_size
        self.action_size = action_size

        # Q-Networks
        self.q_network = QNetwork(state_size, action_size)
        self.target_network = QNetwork(state_size, action_size)
        self.target_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.memory = ReplayBuffer(capacity=100000)

        # Training parameters
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 1e-3
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

    def choose_action(self, state, training=True):
        """Select action using epsilon-greedy policy"""
        if training and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_values = self.q_network(state_tensor)
            return action_values.argmax().item()

    def learn(self):
        """Update Q-Network weights"""
        if len(self.memory) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)

        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards.unsqueeze(1) + (self.gamma * Q_targets_next * (1 - dones.unsqueeze(1)))

        # Get expected Q values from local model
        Q_expected = self.q_network(states).gather(1, actions.unsqueeze(1))

        # Compute loss and perform optimization
        loss = nn.MSELoss()(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft update target network
        self._update_target_network()

        # Update epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def _update_target_network(self):
        """Soft update of target network"""
        for target_param, local_param in zip(self.target_network.parameters(), self.q_network.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def save(self, path):
        """Save model weights"""
        torch.save({
            'q_network_state_dict': self.q_network.state_dict(),
            'target_network_state_dict': self.target_network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, path)

    def load(self, path):
        """Load model weights"""
        if os.path.exists(path):
            checkpoint = torch.load(path)
            self.q_network.load_state_dict(checkpoint['q_network_state_dict'])
            self.target_network.load_state_dict(checkpoint['target_network_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

def train_pong_agent(episodes=1000, max_steps=1000):
    """Train the Pong agent"""
    env = pong_v3.env()
    env.reset()

    # Initialize the environment and get the first agent
    for agent in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()

        # Get environment dimensions from the first agent's observation and action spaces
        state_size = env.observation_space(agent).shape[0]  # Corrected access
        action_size = env.action_space(agent).n  # Corrected access
        break

    agent = PongAgent(state_size, action_size)
    scores = []

    for episode in range(episodes):
        env.reset()
        episode_reward = 0
        step_count = 0

        # Main training loop
        for agent_name in env.agent_iter():
            if step_count >= max_steps:
                break

            observation, reward, termination, truncation, info = env.last()
            done = termination or truncation

            # Choose and perform action
            action = agent.choose_action(observation)
            env.step(action)

            # Get next state
            next_observation, next_reward, next_termination, next_truncation, next_info = env.last()

            # Store experience in replay memory
            agent.memory.add(observation, action, reward, next_observation, done)

            # Learn from experiences
            agent.learn()

            episode_reward += reward
            step_count += 1

            if done:
                break

        scores.append(episode_reward)

        # Print progress
        if (episode + 1) % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(f'Episode {episode+1} Average Score: {avg_score:.2f}')

            # Save model if performance is good
            if avg_score >= 195.0:
                agent.save('pong_agent.pth')
                print('Environment solved!')
                break

    env.close()
    return agent, scores

In [9]:
import numpy as np
from typing import Dict, List, Callable, Tuple
from pettingzoo import AECEnv
from pettingzoo.atari import pong_v3
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd
import seaborn as sns
from tqdm import tqdm

class PongEvaluator:
    """Comprehensive evaluation framework for Pong agents"""
    def __init__(
        self,
        env_creator: Callable[[], AECEnv],
        our_agent: Callable,
        opponent_policies: Dict[str, Callable],
        n_episodes: int = 100
    ):
        self.env_creator = env_creator
        self.our_agent = our_agent
        self.opponent_policies = opponent_policies
        self.n_episodes = n_episodes
        self.results = defaultdict(list)
        self.episode_histories = []

    def evaluate_against_all(self) -> Dict[str, Dict[str, float]]:
        """Evaluate our agent against all opponent policies"""
        all_metrics = {}

        for opponent_name, opponent_policy in self.opponent_policies.items():
            print(f"\nEvaluating against {opponent_name}...")
            env = MultiOpponentEnv(
                self.env_creator,
                {opponent_name: opponent_policy},
                "first_0"
            )

            metrics = self._run_evaluation(env, opponent_name)
            all_metrics[opponent_name] = metrics

        return all_metrics

    def _run_evaluation(self, env: MultiOpponentEnv, opponent_name: str) -> Dict[str, float]:
        """Run evaluation episodes against a single opponent"""
        episode_rewards = []
        win_count = 0
        draw_count = 0
        avg_episode_length = []

        for episode in tqdm(range(self.n_episodes)):
            state = env.reset()
            episode_reward = 0
            steps = 0

            done = False
            truncated = False

            while not (done or truncated):
                action = self.our_agent(state)
                next_state, reward, done, truncated, info = env.step(action)
                episode_reward += reward
                state = next_state
                steps += 1

            # Record metrics
            episode_rewards.append(episode_reward)
            avg_episode_length.append(steps)

            # Determine win/loss/draw
            if episode_reward > 0:
                win_count += 1
            elif episode_reward == 0:
                draw_count += 1

            # Store detailed episode history
            self.episode_histories.append({
                'opponent': opponent_name,
                'episode': episode,
                'reward': episode_reward,
                'steps': steps
            })

        # Calculate metrics
        metrics = {
            'mean_reward': np.mean(episode_rewards),
            'std_reward': np.std(episode_rewards),
            'win_rate': win_count / self.n_episodes,
            'draw_rate': draw_count / self.n_episodes,
            'avg_episode_length': np.mean(avg_episode_length)
        }

        return metrics

    def plot_results(self):
        """Generate comprehensive visualization of evaluation results"""
        results_df = pd.DataFrame(self.episode_histories)

        # Create figure with subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        # 1. Performance across opponents
        sns.boxplot(data=results_df, x='opponent', y='reward', ax=ax1)
        ax1.set_title('Reward Distribution by Opponent')
        ax1.set_xlabel('Opponent')
        ax1.set_ylabel('Reward')

        # 2. Win rates
        win_rates = []
        for opponent in results_df['opponent'].unique():
            opponent_data = results_df[results_df['opponent'] == opponent]
            win_rate = (opponent_data['reward'] > 0).mean()
            win_rates.append({'opponent': opponent, 'win_rate': win_rate})

        win_rates_df = pd.DataFrame(win_rates)
        sns.barplot(data=win_rates_df, x='opponent', y='win_rate', ax=ax2)
        ax2.set_title('Win Rate by Opponent')
        ax2.set_xlabel('Opponent')
        ax2.set_ylabel('Win Rate')

        # 3. Episode length distribution
        sns.boxplot(data=results_df, x='opponent', y='steps', ax=ax3)
        ax3.set_title('Episode Length Distribution')
        ax3.set_xlabel('Opponent')
        ax3.set_ylabel('Steps')

        # 4. Learning curve (if applicable)
        for opponent in results_df['opponent'].unique():
            opponent_data = results_df[results_df['opponent'] == opponent]
            rolling_mean = opponent_data['reward'].rolling(window=10).mean()
            ax4.plot(rolling_mean, label=opponent)

        ax4.set_title('Rolling Average Reward (window=10)')
        ax4.set_xlabel('Episode')
        ax4.set_ylabel('Average Reward')
        ax4.legend()

        plt.tight_layout()
        return fig

    def generate_report(self) -> str:
        """Generate a detailed performance report"""
        report = []
        report.append("Performance Report\n================\n")

        # Overall statistics
        total_episodes = len(self.episode_histories)
        unique_opponents = len(set(h['opponent'] for h in self.episode_histories))
        report.append(f"Total Episodes: {total_episodes}")
        report.append(f"Unique Opponents: {unique_opponents}\n")

        # Per-opponent statistics
        results_df = pd.DataFrame(self.episode_histories)
        for opponent in results_df['opponent'].unique():
            opponent_data = results_df[results_df['opponent'] == opponent]

            report.append(f"\nOpponent: {opponent}")
            report.append("-" * (len(opponent) + 10))
            report.append(f"Mean Reward: {opponent_data['reward'].mean():.2f}")
            report.append(f"Win Rate: {(opponent_data['reward'] > 0).mean():.2%}")
            report.append(f"Draw Rate: {(opponent_data['reward'] == 0).mean():.2%}")
            report.append(f"Average Episode Length: {opponent_data['steps'].mean():.1f}")
            report.append(f"Longest Episode: {opponent_data['steps'].max()}")
            report.append(f"Shortest Episode: {opponent_data['steps'].min()}\n")

        return "\n".join(report)

# Example opponent policies
def random_policy(observation):
    """Random action selection"""
    return np.random.randint(0, 3)  # Assuming 3 possible actions

def simple_policy(observation):
    """Simple rule-based policy"""
    # Add your simple policy logic here
    return 1  # Middle action

def create_evaluation_suite():
    """Create a standard evaluation suite with multiple opponents"""
    opponent_policies = {
        "Random": random_policy,
        "Simple": simple_policy,
        # Add more opponents as needed
    }

    return opponent_policies

# Usage example
if __name__ == "__main__":
    # Create opponents
    opponent_policies = create_evaluation_suite()

    # Initialize evaluator
    evaluator = PongEvaluator(
        env_creator=pong_v3.env,
        our_agent=lambda x: np.random.randint(0, 3),  # Replace with your agent
        opponent_policies=opponent_policies,
        n_episodes=100
    )

    # Run evaluation
    metrics = evaluator.evaluate_against_all()

    # Generate and save plots
    fig = evaluator.plot_results()
    plt.savefig('evaluation_results.png')

    # Generate report
    report = evaluator.generate_report()
    print(report)

NameError: name 'MultiOpponentEnv' is not defined