In [14]:
!pip3 install torch torchvision torchaudio
!pip3 install ale_py gymnasium
!pip3 install opencv-python
!pip3 install pympler
!pip3 install logging



In [15]:
import os
from collections import defaultdict, deque
import random
import sys
from pympler import asizeof
import gc
import time  # Add for timing

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
import numpy as np
import cv2

import ale_py
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv  # Correct import for gymnasium

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader

import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from tqdm import tqdm
import logging

In [16]:
# Make the ale_py (Atari) environments known to Gymnasium before any
# `gym.make("ALE/...")` call below.
gym.register_envs(ale_py)

# ===== CONFIGURATION =====
# Rollout / update schedule
N_ENVS = 8  # Number of environments stepped in parallel
N_STEPS = 100  # Vector-env steps collected per outer training iteration
N_UPDATES = 4  # Gradient updates performed after each collection phase

# Image cropping parameters: number of pixel rows/columns removed from each
# edge of the grayscale frame before it is resized (see `preprocess`).
CROP_TOP = 34
CROP_BOTTOM = 16
CROP_LEFT = 0
CROP_RIGHT = 0
# Size of the frame after resizing; also the spatial input size of the network.
RESIZE_WIDTH = 84
RESIZE_HEIGHT = 84

# Training hyperparameters
# NOTE(review): 0.1 is far larger than the learning rates usually paired with
# RMSprop for DQN (on the order of 1e-4) - confirm this value is intentional.
LEARNING_RATE = 0.1
N_EPISODES = 100  # Number of outer training iterations (each collects N_STEPS steps)
START_EPSILON = 0.7
FINAL_EPSILON = 0.1
# Amount subtracted from epsilon on every `decay_epsilon()` call.
# NOTE(review): the factor 1000 presumably approximates steps per episode; the
# training loop calls decay N_STEPS * N_ENVS times per iteration, so with the
# values above epsilon does not reach FINAL_EPSILON - confirm intended schedule.
EPSILON_DECAY = (START_EPSILON - FINAL_EPSILON) / (N_EPISODES * 1000)  # Decay per step
DISCOUNT_FACTOR = 0.99
# NOTE(review): ReplayMemory pre-allocates two uint8 arrays of shape
# (capacity, 4, 84, 84); at this capacity that is roughly 52 GiB in total.
MEMORY_CAPACITY = 1000000
MEMORY_FILL_SIZE = 50000  # Random-policy transitions stored before training starts
MINIBATCH_SIZE = 8192  # Transitions sampled per gradient update
TARGET_UPDATE_FREQ = 1000  # Controls how often the target network is synced
MAX_STEPS_PER_EPISODE = 5000  # NOTE(review): not referenced in the visible cells
REWARD_CLIP = True  # Clip rewards before storing them in replay memory
MODEL_FILE = "dqn_pong"  # Filename prefix for saved checkpoints
LOAD_MODEL = "dqn_pong_best.pth"  # or False
# =========================

# Prefer the GPU when PyTorch can see one; everything else falls back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create vectorized environment
def make_env():
    """Create and return a new ``ALE/Pong-v5`` Gymnasium environment."""
    return gym.make("ALE/Pong-v5")

# One factory per sub-environment; SyncVectorEnv invokes each factory itself,
# so repeating the same callable still yields N_ENVS distinct environments.
env_fns = [make_env] * N_ENVS
vec_env = SyncVectorEnv(env_fns)
print(f"Created vectorized environment with {N_ENVS} parallel instances")

Using device: cpu
Created vectorized environment with 8 parallel instances


In [17]:
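# Frame preprocessing


# Disabled demonstration (kept inside a string literal so it never runs): it
# visualises an earlier preprocessing variant whose crop/resize values differ
# from the `preprocess` function defined below.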
"""
def preprocess_and_show_steps(obs):  # input: 210x160x3 RGB
    fig, axs = plt.subplots(1, 4, figsize=(18, 5))

    # Step 1: Show original RGB image
    axs[0].imshow(obs)
    axs[0].set_title("Original RGB (210x160)")
    axs[0].axis('off')

    # Step 2: Convert to grayscale
    greyscaled = np.dot(obs[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    axs[1].imshow(greyscaled, cmap='gray')
    axs[1].set_title("Grayscale")
    axs[1].axis('off')

    # Step 3: Crop image vertically (rows 95 to 195)
    cropped = greyscaled[95:195, :]
    axs[2].imshow(cropped, cmap='gray')
    axs[2].set_title("Cropped (95:195)")
    axs[2].axis('off')

    # Step 4: Resize to 100x50 using nearest neighbor
    resized = cv2.resize(cropped, dsize=(50, 25), interpolation=cv2.INTER_NEAREST)
    axs[3].imshow(resized, cmap='gray')
    axs[3].set_title("Resized to 100x50")
    axs[3].axis('off')

    plt.tight_layout()
    plt.show()

    return resized

# Run and visualize preprocessing steps
obs, info = env.reset()
processed = preprocess_and_show_steps(obs)
"""

def preprocess(obs,
               crop_top=CROP_TOP,
               crop_bottom=CROP_BOTTOM,
               crop_left=CROP_LEFT,
               crop_right=CROP_RIGHT,
               resize_width=RESIZE_WIDTH,
               resize_height=RESIZE_HEIGHT):
    """Turn one raw frame into a small cropped grayscale image.

    Args:
        obs: Frame as a NumPy array, either ``(H, W, 3)`` RGB or an already
            grayscale ``(H, W)`` array.
        crop_top, crop_bottom, crop_left, crop_right: Number of pixel
            rows/columns removed from the respective edge before resizing.
        resize_width, resize_height: Size of the returned image.

    Returns:
        Array of shape ``(resize_height, resize_width)``. RGB input yields
        ``uint8``; a 2-D input keeps its own dtype.

    Raises:
        ValueError: If the crop margins leave no pixels.
    """
    if obs.ndim == 3 and obs.shape[2] == 3:
        # Weighted luma conversion. The result is truncated to uint8 here so
        # the frames the policy acts on are bit-identical to the frames that
        # end up in the uint8 replay buffer (previously float64 frames were
        # silently truncated only when stored).
        greyscaled = np.dot(obs, [0.299, 0.587, 0.114]).astype(np.uint8)
    else:
        greyscaled = obs

    height, width = greyscaled.shape

    # Crop boundaries expressed as absolute row/column indices.
    top_bound = crop_top
    bottom_bound = height - crop_bottom
    left_bound = crop_left
    right_bound = width - crop_right

    if bottom_bound <= top_bound or right_bound <= left_bound:
        raise ValueError(
            f"Crop margins (top={crop_top}, bottom={crop_bottom}, "
            f"left={crop_left}, right={crop_right}) leave no pixels of a "
            f"{height}x{width} frame"
        )

    cropped = greyscaled[top_bound:bottom_bound, left_bound:right_bound]

    # cv2 takes the target size as (width, height); nearest-neighbour keeps
    # the original pixel values instead of blending them.
    return cv2.resize(cropped, (resize_width, resize_height),
                      interpolation=cv2.INTER_NEAREST)

In [18]:
# defining neural network
# based on this architecture: https://arxiv.org/pdf/1312.5602
# code written by Claude Sonnet 4

class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        num_actions: Number of discrete actions; size of the output layer.
        in_channels: Number of input channels (stacked frames).
        input_shape: Optional ``(height, width)`` of the input frames. When
            omitted, the module-level ``RESIZE_HEIGHT`` / ``RESIZE_WIDTH`` are
            used, which matches the previous behaviour.

    Attributes:
        linear_size: Number of features produced by the conv stack for one
            sample of the configured input size.
    """

    def __init__(self, num_actions, in_channels=4, input_shape=None):
        super().__init__()

        if input_shape is None:
            # Resolved at construction time so the class itself does not
            # depend on notebook globals when an explicit shape is passed.
            input_shape = (RESIZE_HEIGHT, RESIZE_WIDTH)
        height, width = input_shape

        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Push a dummy sample through the conv stack to learn how many
        # features the first linear layer has to accept.
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, height, width)
            self.linear_size = self._conv_features(probe).size(1)

        self.fc1 = nn.Linear(self.linear_size, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def _conv_features(self, x):
        """Run the conv stack and flatten everything except the batch axis."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        return torch.flatten(x, start_dim=1)

    def forward(self, x):
        """Return one Q-value per action for every sample in the batch ``x``."""
        x = F.relu(self.fc1(self._conv_features(x)))
        return self.fc2(x)

# example input: batch of 32 stacked, preprocessed frames
# batch_size = 32
# input_tensor = torch.randn(batch_size, 4, RESIZE_HEIGHT, RESIZE_WIDTH)  # (batch, frames, height, width)

# forward pass
# model = DQN(vec_env.single_action_space.n)
# q_values = model(input_tensor)
# print(f"Input shape: {input_tensor.shape}")
# print(f"Output Q-values shape: {q_values.shape}")

In [19]:
class ReplayMemory:
    """Fixed-capacity circular store of transitions backed by NumPy arrays.

    All storage is allocated up front. Once ``capacity`` transitions have been
    pushed, each new transition overwrites the oldest one.
    """

    def __init__(self, capacity, state_shape):
        self.capacity = capacity
        self.state_shape = state_shape

        # Both frame arrays share one layout: (capacity, *state_shape), uint8.
        frame_block = (capacity, *state_shape)
        self.states = np.zeros(frame_block, dtype=np.uint8)
        self.next_states = np.zeros(frame_block, dtype=np.uint8)

        # Scalar columns; rewards are kept in half precision to save memory.
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float16)
        self.dones = np.zeros(capacity, dtype=np.bool_)

        self.position = 0  # slot the next push writes to
        self.size = 0      # number of valid transitions currently stored

    def push(self, state, action, reward, next_state, done):
        """Write one transition at the cursor, then advance the cursor."""
        slot = self.position
        self.states[slot] = state
        self.next_states[slot] = next_state
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.dones[slot] = done

        # Wrap around at the end of the arrays; size saturates at capacity.
        self.position = (slot + 1) % self.capacity
        if self.size < self.capacity:
            self.size += 1

    def sample(self, batch_size):
        """Return ``(states, actions, rewards, next_states, dones)`` for
        ``batch_size`` distinct, uniformly drawn stored transitions."""
        picked = np.random.choice(self.size, size=batch_size, replace=False)
        columns = (self.states, self.actions, self.rewards,
                   self.next_states, self.dones)
        return tuple(column[picked] for column in columns)

    def __len__(self):
        return self.size

    def memory_usage_gb(self):
        """Total size of the backing arrays in GiB (bytes / 1024**3)."""
        arrays = (self.states, self.next_states, self.actions,
                  self.rewards, self.dones)
        return sum(array.nbytes for array in arrays) / (1024 ** 3)

In [20]:
# Initialize replay memory; each stored state is a stack of 4 resized frames.
state_shape = (4, RESIZE_HEIGHT, RESIZE_WIDTH)
replay_memory = ReplayMemory(MEMORY_CAPACITY, state_shape)

# Read the action count from the vectorized environment that already exists
# instead of constructing two throwaway emulators that are never closed.
n_actions = int(vec_env.single_action_space.n)

# Online network is trained; target network only provides bootstrap targets
# and is kept in eval mode and synced from the online network.
online_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
target_net.load_state_dict(online_net.state_dict())
target_net.eval()

# === Load saved weights to continue training ===
# A missing checkpoint is reported instead of raising, so a fresh
# "Restart & Run All" works before any model has been saved.
if LOAD_MODEL:
    if os.path.isfile(LOAD_MODEL):
        online_net.load_state_dict(torch.load(LOAD_MODEL, map_location=device))
        target_net.load_state_dict(online_net.state_dict())
        print(f"Loaded weights from {LOAD_MODEL}")
    else:
        print(f"Checkpoint '{LOAD_MODEL}' not found - starting from random weights")

# Optimizer
optimizer = optim.RMSprop(
    online_net.parameters(),
    lr=LEARNING_RATE,
    alpha=0.95,
    momentum=0.95,
    eps=0.01
)

In [21]:
class VectorAgent:
    """Epsilon-greedy action selection for a batch of environments, with a
    linearly decaying exploration rate."""

    def __init__(self, num_envs, initial_epsilon, epsilon_decay, final_epsilon, discount_factor):
        self.num_envs = num_envs
        self.discount_factor = discount_factor
        # Exploration schedule: start value, per-call decrement, lower bound.
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

    def get_actions(self, states, model):
        """Return one action per environment.

        ``states`` is scaled by 1/255 and passed through ``model``; the
        highest-valued action is used unless that environment's uniform draw
        falls below ``self.epsilon``, in which case a sample from
        ``vec_env.single_action_space`` is used instead.
        """
        batch = torch.tensor(states, dtype=torch.float32, device=device).div(255.0)

        with torch.no_grad():
            greedy = model(batch).argmax(dim=1)
        chosen = greedy.cpu().numpy()

        # Decide per environment whether to explore, then draw one candidate
        # random action for every environment and keep only the masked ones.
        explore = np.random.rand(self.num_envs) < self.epsilon
        random_draws = np.array(
            [vec_env.single_action_space.sample() for _ in range(self.num_envs)]
        )
        chosen[explore] = random_draws[explore]

        return chosen

    def decay_epsilon(self):
        """Lower epsilon by one decrement, never going below ``final_epsilon``."""
        lowered = self.epsilon - self.epsilon_decay
        self.epsilon = lowered if lowered > self.final_epsilon else self.final_epsilon

In [None]:
# Epsilon-greedy agent shared by all parallel environments
agent = VectorAgent(
    num_envs=N_ENVS,
    initial_epsilon=START_EPSILON,
    epsilon_decay=EPSILON_DECAY,
    final_epsilon=FINAL_EPSILON,
    discount_factor=DISCOUNT_FACTOR,
)

# Bookkeeping that later cells append to / read from
total_steps = 0
best_reward = float("-inf")
episode_rewards, episode_losses, episode_steps = [], [], []

# Per-environment rolling window of the 4 most recent frames, plus the
# stacked (4, H, W) state currently observed by each environment
frame_buffers = [deque(maxlen=4) for _ in range(N_ENVS)]
current_states = np.zeros((N_ENVS, 4, RESIZE_HEIGHT, RESIZE_WIDTH))

# Standalone (non-vectorized) environments used by the pre-fill phase
print("Creating individual environments...")
envs = [make_env() for _ in range(N_ENVS)]

def reset_env(i):
    """Restart environment ``i`` and return its initial stacked state.

    The first preprocessed frame is repeated four times in
    ``frame_buffers[i]`` so that a full stack exists from the first step.
    """
    first_obs, _info = envs[i].reset()
    first_frame = preprocess(first_obs)

    buffer = frame_buffers[i]
    buffer.clear()
    buffer.extend([first_frame] * 4)

    return np.stack(buffer, axis=0)

# Initialize all environments
print("Initializing parallel environments...")
for i in range(N_ENVS):
    current_states[i] = reset_env(i)

# Pre-fill replay memory with transitions generated by a uniformly random policy
print(f"Pre-filling replay memory to {MEMORY_FILL_SIZE} transitions...")
pbar = tqdm(total=MEMORY_FILL_SIZE, desc="Pre-filling")
episodes_completed = 0

# done_flags[i] is True when environment i finished its episode on the
# previous iteration and therefore has to be reset before it is stepped again.
done_flags = [False] * N_ENVS

while len(replay_memory) < MEMORY_FILL_SIZE:
    next_states = np.zeros_like(current_states)
    stored = 0  # transitions actually written during this iteration

    for i in range(N_ENVS):
        if done_flags[i]:
            # Start a new episode. Nothing is stored for this iteration: a
            # "transition" from the previous episode's terminal state to the
            # reset state (with a dummy action) is not a real environment
            # step and would corrupt the replay data.
            next_states[i] = reset_env(i)
            done_flags[i] = False
            continue

        # Random action for an active environment
        action = envs[i].action_space.sample()
        next_obs, reward, done, truncated, info = envs[i].step(action)

        # Slide the 4-frame window forward by one frame
        frame_buffers[i].append(preprocess(next_obs))
        next_states[i] = np.stack(frame_buffers[i], axis=0)

        if REWARD_CLIP:
            reward = np.clip(reward, -1, 1)

        # Termination and truncation both end the episode for bootstrapping
        episode_over = done or truncated
        replay_memory.push(
            current_states[i],
            action,
            reward,
            next_states[i],
            episode_over
        )
        stored += 1

        if episode_over:
            done_flags[i] = True
            episodes_completed += 1

    current_states = next_states
    # Advance the bar by what was really stored so it matches len(replay_memory)
    pbar.update(stored)
    pbar.set_postfix({"Memory": len(replay_memory), "Episodes": episodes_completed})

pbar.close()
print(f"Replay memory filled with {len(replay_memory)} transitions")

Creating individual environments...
Initializing parallel environments...
Pre-filling replay memory to 50000 transitions...


Pre-filling:  32%|▎| 16008/50000 [00:13<00:27, 1231.36it/s, Memory=16008, Episod

In [12]:
# Configure logging: every record goes both to training.log and to the cell
# output. `force=True` replaces any handlers already attached to the root
# logger; without it basicConfig silently does nothing when the hosting
# environment (or an earlier run of this cell) has configured logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Training loop with parallel environments - LOGGING VERSION
logger.info("Starting training with parallel environments...")

# All counters are reset together with the clock. `total_steps` in particular
# must restart at 0 here: the steps/s figure divides it by the time elapsed
# since `start_time`, so a value left over from an earlier (interrupted) run
# of this cell would inflate the reported speed.
total_steps = 0
total_updates = 0
start_time = time.time()
episode_rewards = []      # Reward of every completed episode
episode_steps_list = []   # Length (in steps) of every completed episode
episode_losses = []       # Average loss of each training phase
best_reward = -float('inf')

# Single progress bar over the outer training iterations
episode_pbar = tqdm(range(N_EPISODES), desc="Training Episodes")

# ---------------------------------------------------------------------------
# One-time rollout state.
#
# The environments are reset ONCE here rather than at the top of every outer
# iteration. Resetting every N_STEPS steps would restart each game long before
# it can finish, so no episode would ever complete and the agent would only
# ever see the opening frames.
# ---------------------------------------------------------------------------
obs, _ = vec_env.reset()
frame_buffers = [deque(maxlen=4) for _ in range(N_ENVS)]
current_states = np.zeros((N_ENVS, 4, RESIZE_HEIGHT, RESIZE_WIDTH))
for i in range(N_ENVS):
    frame = preprocess(obs[i])
    for _ in range(4):
        frame_buffers[i].append(frame)
    current_states[i] = np.stack(frame_buffers[i], axis=0)

# Running reward / length of the episode currently in progress in each env.
episode_rewards_current = np.zeros(N_ENVS)
episode_steps_current = np.zeros(N_ENVS)

# awaiting_reset[i] is True when env i ended its episode on the previous step.
# This relies on the vector env's default "next step" autoreset: the step
# after an episode ends ignores the action, resets that sub-environment and
# returns its first observation with reward 0 and done flags False.
# (SyncVectorEnv has no per-index reset_async/reset_wait API.)
awaiting_reset = np.zeros(N_ENVS, dtype=bool)

# Gradient-update count at the last target-network sync.
updates_at_last_sync = total_updates

for episode in episode_pbar:
    # Each outer iteration = one collection phase + one training phase.
    logger.info(f"Starting episode {episode}/{N_EPISODES}")
    episode_start_time = time.time()

    # ----- Collection phase: N_STEPS steps in every environment -----
    for step in range(N_STEPS):
        actions = agent.get_actions(current_states, online_net)
        next_obs, rewards, dones, truncateds, infos = vec_env.step(actions)

        next_states = np.zeros_like(current_states)
        for i in range(N_ENVS):
            next_frame = preprocess(next_obs[i])

            if awaiting_reset[i]:
                # This step was consumed by the autoreset: next_obs[i] is the
                # first frame of a new episode. Rebuild the frame stack from
                # it and store nothing, because no real transition happened.
                frame_buffers[i].clear()
                for _ in range(4):
                    frame_buffers[i].append(next_frame)
                next_states[i] = np.stack(frame_buffers[i], axis=0)
                awaiting_reset[i] = False
                continue

            # Slide the 4-frame window forward by one frame
            frame_buffers[i].append(next_frame)
            next_states[i] = np.stack(frame_buffers[i], axis=0)

            # Same clip range as the pre-fill phase, so every transition in
            # the buffer uses one reward scale.
            if REWARD_CLIP:
                rewards[i] = np.clip(rewards[i], -1, 1)

            episode_rewards_current[i] += rewards[i]
            episode_steps_current[i] += 1

            # Termination and truncation both stop bootstrapping
            episode_over = bool(dones[i] or truncateds[i])
            replay_memory.push(
                current_states[i],
                actions[i],
                rewards[i],
                next_states[i],
                episode_over
            )

            if episode_over:
                # Record the finished episode and restart its counters
                episode_rewards.append(episode_rewards_current[i])
                episode_steps_list.append(episode_steps_current[i])
                logger.info(f"Env {i} completed episode with reward: {episode_rewards_current[i]:.2f}")
                episode_rewards_current[i] = 0
                episode_steps_current[i] = 0
                awaiting_reset[i] = True

        current_states = next_states
        total_steps += N_ENVS

        # Log every 100 steps
        if step % 100 == 0:
            avg_reward = np.mean(episode_rewards[-10:]) if episode_rewards else 0
            logger.info(f"Step {step}/{N_STEPS} - Total Steps: {total_steps} - Avg Reward: {avg_reward:.2f}")

    # ----- Training phase: N_UPDATES gradient steps -----
    if len(replay_memory) > MINIBATCH_SIZE:
        step_losses = []
        for update in range(N_UPDATES):
            # batch_* names keep the sampled minibatch separate from the
            # rollout variables (actions, rewards, next_states, dones) above.
            (batch_states, batch_actions, batch_rewards,
             batch_next_states, batch_dones) = replay_memory.sample(MINIBATCH_SIZE)

            # Convert to tensors; frames are scaled from [0, 255] to [0, 1]
            states_tensor = torch.as_tensor(batch_states, device=device, dtype=torch.float32) / 255.0
            next_states_tensor = torch.as_tensor(batch_next_states, device=device, dtype=torch.float32) / 255.0
            actions_tensor = torch.as_tensor(batch_actions, device=device, dtype=torch.long)
            rewards_tensor = torch.as_tensor(batch_rewards, device=device, dtype=torch.float32)
            dones_tensor = torch.as_tensor(batch_dones, device=device, dtype=torch.float32)

            # Q(s, a) of the actions that were actually taken
            current_q = online_net(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)

            # One-step target; (1 - done) removes the bootstrap term at
            # episode ends
            with torch.no_grad():
                next_q = target_net(next_states_tensor).max(1)[0]
                target_q = rewards_tensor + DISCOUNT_FACTOR * next_q * (1 - dones_tensor)

            loss = F.mse_loss(current_q, target_q)
            step_losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            # Element-wise gradient clipping to [-1, 1]
            torch.nn.utils.clip_grad_value_(online_net.parameters(), 1.0)
            optimizer.step()

            total_updates += 1

            # Log every 5 updates
            if update % 5 == 0:
                logger.info(f"Update {update}/{N_UPDATES} - Loss: {loss.item():.4f}")

        avg_loss = sum(step_losses) / len(step_losses) if step_losses else 0
        episode_losses.append(avg_loss)
        logger.info(f"Training updates completed - Avg Loss: {avg_loss:.4f}")
    else:
        episode_losses.append(0)
        logger.info("Skipping training updates (insufficient samples)")

    # Sync the target network once at least TARGET_UPDATE_FREQ gradient
    # updates have happened since the previous sync. A modulo test on
    # total_updates is avoided because the counter advances N_UPDATES at a
    # time and can step over the exact multiple.
    if total_updates - updates_at_last_sync >= TARGET_UPDATE_FREQ:
        target_net.load_state_dict(online_net.state_dict())
        updates_at_last_sync = total_updates
        logger.info("Target network updated")

    # Decay epsilon once per environment step of this collection phase
    for _ in range(N_STEPS * N_ENVS):
        agent.decay_epsilon()

    # Metrics over the most recent 50 completed episodes (0 until one exists)
    avg_reward = np.mean(episode_rewards[-50:]) if episode_rewards else 0
    avg_steps = np.mean(episode_steps_list[-50:]) if episode_steps_list else 0
    elapsed = time.time() - start_time
    steps_per_sec = total_steps / elapsed if elapsed > 0 else 0
    episode_time = time.time() - episode_start_time

    # Update episode progress bar
    episode_pbar.set_postfix({
        "Avg Reward": f"{avg_reward:.2f}",
        "Epsilon": f"{agent.epsilon:.4f}",
        "Avg Loss": f"{episode_losses[-1]:.4f}",
        "Speed": f"{steps_per_sec:.1f} steps/s"
    })

    # Log episode completion
    logger.info(f"Episode {episode} completed in {episode_time:.2f} seconds")
    logger.info(f"  Total Steps: {total_steps}")
    logger.info(f"  Avg Reward (50): {avg_reward:.2f}")
    logger.info(f"  Epsilon: {agent.epsilon:.4f}")
    logger.info(f"  Avg Loss: {episode_losses[-1]:.4f}")
    logger.info(f"  Speed: {steps_per_sec:.1f} steps/s")

    # Save best model. Only real measurements count: before any episode has
    # finished avg_reward is the placeholder 0, which would otherwise be
    # recorded as the "best" score and block every later negative score.
    if episode_rewards and avg_reward > best_reward:
        best_reward = avg_reward
        torch.save(online_net.state_dict(), f"{MODEL_FILE}_best.pth")
        logger.info(f"New best model saved with reward: {avg_reward:.2f}")
        episode_pbar.write(f"New best model saved with reward: {avg_reward:.2f}")

# Persist the final weights, then release the progress bar
final_checkpoint = f"{MODEL_FILE}_final.pth"
torch.save(online_net.state_dict(), final_checkpoint)
episode_pbar.close()
logger.info("Training complete. Models saved.")

Starting training with parallel environments...


Training Episodes:   0%|                                | 0/100 [00:00<?, ?it/s]
Episode 0 Steps:   0%|                                  | 0/100 [00:00<?, ?it/s][A
Episode 0 Steps:   0%|  | 0/100 [00:00<?, ?it/s, Total Steps=1608, Avg Reward=0][A
Episode 0 Steps:   0%|  | 0/100 [00:00<?, ?it/s, Total Steps=1616, Avg Reward=0][A
Episode 0 Steps:   0%|  | 0/100 [00:00<?, ?it/s, Total Steps=1624, Avg Reward=0][A
Episode 0 Steps:   0%|  | 0/100 [00:00<?, ?it/s, Total Steps=1632, Avg Reward=0][A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1632, Avg Re[A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1640, Avg Re[A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1648, Avg Re[A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1656, Avg Re[A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1664, Avg Re[A
Episode 0 Steps:   4%| | 4/100 [00:00<00:02, 37.76it/s, Total Steps=1672, Avg R

KeyboardInterrupt: 

In [None]:
# Plot results: three side-by-side panels, saved as training_results.png
fig, (ax_reward, ax_loss, ax_steps) = plt.subplots(1, 3, figsize=(18, 5))

# Reward of every completed episode
ax_reward.plot(episode_rewards)
ax_reward.set_title("Episode Rewards")
ax_reward.set_xlabel("Episode")
ax_reward.set_ylabel("Total Reward")

# Average loss of every training phase
ax_loss.plot(episode_losses)
ax_loss.set_title("Training Loss")
ax_loss.set_xlabel("Episode")
ax_loss.set_ylabel("Average Loss")

# Length of every completed episode. The training loop records these in
# `episode_steps_list`; `episode_steps` is never filled, so plotting it
# would always produce an empty panel.
ax_steps.plot(episode_steps_list)
ax_steps.set_title("Episode Steps")
ax_steps.set_xlabel("Episode")
ax_steps.set_ylabel("Steps Taken")

fig.tight_layout()
fig.savefig("training_results.png")
plt.show()
