In [1]:
!pip3 install torch torchvision torchaudio
!pip3 install ale_py gymnasium
!pip3 install opencv-python
!pip3 install pympler



In [3]:
import os
from collections import defaultdict, deque
import random
import sys
from pympler import asizeof
import gc
import time  # Add for timing

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
import numpy as np
import cv2

import ale_py
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv  # Correct import for gymnasium

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader

import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from tqdm import tqdm

In [4]:
gym.register_envs(ale_py)

# ===== CONFIGURATION =====
# Add parallel environment settings
N_ENVS = 8  # Number of parallel environments
N_STEPS = 100  # Collect 100 steps before training
N_UPDATES = 4  # Number of training updates after collecting steps

# Image cropping parameters
CROP_TOP = 34
CROP_BOTTOM = 16
CROP_LEFT = 0
CROP_RIGHT = 0
RESIZE_WIDTH = 84
RESIZE_HEIGHT = 84

# Training hyperparameters
LEARNING_RATE = 0.1
N_EPISODES = 100
START_EPSILON = 0.7
FINAL_EPSILON = 0.1
EPSILON_DECAY = (START_EPSILON - FINAL_EPSILON) / (N_EPISODES * 1000)  # Decay per step
DISCOUNT_FACTOR = 0.99
MEMORY_CAPACITY = 1000000
MEMORY_FILL_SIZE = 50000
MINIBATCH_SIZE = 8192
TARGET_UPDATE_FREQ = 1000
MAX_STEPS_PER_EPISODE = 5000
REWARD_CLIP = True
MODEL_FILE = "dqn_pong"
LOAD_MODEL = "dqn_pong_best.pth"  # or False
# =========================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create vectorized environment
def make_env():
    env = gym.make("ALE/Pong-v5")
    return env

env_fns = [make_env for _ in range(N_ENVS)]
vec_env = SyncVectorEnv(env_fns)
print(f"Created vectorized environment with {N_ENVS} parallel instances")

Using device: cpu


A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


Created vectorized environment with 8 parallel instances


In [5]:
# preprocessing image


# preprocess image demonstration
"""
def preprocess_and_show_steps(obs):  # input: 210x160x3 RGB
    fig, axs = plt.subplots(1, 4, figsize=(18, 5))

    # Step 1: Show original RGB image
    axs[0].imshow(obs)
    axs[0].set_title("Original RGB (210x160)")
    axs[0].axis('off')

    # Step 2: Convert to grayscale
    greyscaled = np.dot(obs[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    axs[1].imshow(greyscaled, cmap='gray')
    axs[1].set_title("Grayscale")
    axs[1].axis('off')

    # Step 3: Crop image vertically (rows 95 to 195)
    cropped = greyscaled[95:195, :]
    axs[2].imshow(cropped, cmap='gray')
    axs[2].set_title("Cropped (95:195)")
    axs[2].axis('off')

    # Step 4: Resize to 100x50 using nearest neighbor
    resized = cv2.resize(cropped, dsize=(50, 25), interpolation=cv2.INTER_NEAREST)
    axs[3].imshow(resized, cmap='gray')
    axs[3].set_title("Resized to 100x50")
    axs[3].axis('off')

    plt.tight_layout()
    plt.show()

    return resized

# Run and visualize preprocessing steps
obs, info = env.reset()
processed = preprocess_and_show_steps(obs)
"""

def preprocess(obs,
               crop_top=CROP_TOP,
               crop_bottom=CROP_BOTTOM,
               crop_left=CROP_LEFT,
               crop_right=CROP_RIGHT,
               resize_width=RESIZE_WIDTH,
               resize_height=RESIZE_HEIGHT):
    # Convert to grayscale
    if len(obs.shape) == 3 and obs.shape[2] == 3:
        greyscaled = np.dot(obs[..., :3], [0.299, 0.587, 0.114])
    else:
        greyscaled = obs

    # Get image dimensions
    height, width = greyscaled.shape

    # Calculate crop boundaries
    top_bound = crop_top
    bottom_bound = height - crop_bottom
    left_bound = crop_left
    right_bound = width - crop_right

    # Perform cropping
    cropped = greyscaled[top_bound:bottom_bound, left_bound:right_bound]

    # Resize
    resized = cv2.resize(cropped, (resize_width, resize_height),
                         interpolation=cv2.INTER_NEAREST)

    return resized

In [6]:
# defining neural network
# based on this architecture: https://arxiv.org/pdf/1312.5602
# code written by Claude Sonnet 4

class DQN(nn.Module):
    def __init__(self, num_actions, in_channels=4):
        super(DQN, self).__init__()

        # Convolutional layers (standard DQN architecture)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Dynamically calculate linear layer size
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, RESIZE_HEIGHT, RESIZE_WIDTH)
            dummy = F.relu(self.conv1(dummy))
            dummy = F.relu(self.conv2(dummy))
            dummy = F.relu(self.conv3(dummy))
            self.linear_size = dummy.view(1, -1).size(1)

        self.fc1 = nn.Linear(self.linear_size, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# example input: batch of 32 preprocessed frames
# batch_size = 32
# input_tensor = torch.randn(batch_size, 4, 25, 50) # Corrected input shape

# forward pass
# model = DQN(env.action_space.n)
# q_values = model(input_tensor)
# print(f"Input shape: {input_tensor.shape}")
# print(f"Output Q-values shape: {q_values.shape}")

In [7]:
class ReplayMemory:
    def __init__(self, capacity, state_shape):
        self.capacity = capacity
        self.state_shape = state_shape
        # Pre-allocate contiguous arrays
        self.states = np.zeros((capacity, *state_shape), dtype=np.uint8)
        self.next_states = np.zeros((capacity, *state_shape), dtype=np.uint8)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float16)  # Reduce precision
        self.dones = np.zeros(capacity, dtype=np.bool_)
        self.position = 0
        self.size = 0

    def push(self, state, action, reward, next_state, done):
        self.states[self.position] = state
        self.next_states[self.position] = next_state
        self.actions[self.position] = action
        self.rewards[self.position] = reward
        self.dones[self.position] = done

        self.position = (self.position + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        indices = np.random.choice(self.size, batch_size, replace=False)
        return (
            self.states[indices],
            self.actions[indices],
            self.rewards[indices],
            self.next_states[indices],
            self.dones[indices]
        )

    def __len__(self):
        return self.size

    def memory_usage_gb(self):
        total_bytes = (self.states.nbytes + self.next_states.nbytes +
                       self.actions.nbytes + self.rewards.nbytes +
                       self.dones.nbytes)
        return total_bytes / (1024 ** 3)


In [8]:
# Initialize replay memory
state_shape = (4, RESIZE_HEIGHT, RESIZE_WIDTH)
replay_memory = ReplayMemory(MEMORY_CAPACITY, state_shape)

# Initialize networks
online_net = DQN(env.action_space.n).to(device)
target_net = DQN(env.action_space.n).to(device)
target_net.load_state_dict(online_net.state_dict())
target_net.eval()

# === Load saved weights to continue training ===
if LOAD_MODEL:
    online_net.load_state_dict(torch.load(LOAD_MODEL, map_location=device))
    target_net.load_state_dict(online_net.state_dict())

# Optimizer
optimizer = optim.RMSprop(
    online_net.parameters(),
    lr=LEARNING_RATE,
    alpha=0.95,
    momentum=0.95,
    eps=0.01
)

NameError: name 'env' is not defined

In [None]:
class VectorAgent:
    def __init__(self, num_envs, initial_epsilon, epsilon_decay, final_epsilon, discount_factor):
        self.num_envs = num_envs
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor

    def get_actions(self, states, model):
        """Get actions for all environments in a batch"""
        # Convert states to tensor
        states_tensor = torch.tensor(states, dtype=torch.float32, device=device) / 255.0

        with torch.no_grad():
            q_values = model(states_tensor)

        # Epsilon-greedy for each environment
        actions = q_values.argmax(1).cpu().numpy()
        rand_mask = np.random.rand(self.num_envs) < self.epsilon
        actions[rand_mask] = np.array([vec_env.single_action_space.sample() for _ in range(self.num_envs)])[rand_mask]

        return actions

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [None]:
# Initialize agent
agent = VectorAgent(
    num_envs=N_ENVS,
    initial_epsilon=START_EPSILON,
    epsilon_decay=EPSILON_DECAY,
    final_epsilon=FINAL_EPSILON,
    discount_factor=DISCOUNT_FACTOR,
)

# Training setup
total_steps = 0
episode_rewards = []
episode_losses = []
episode_steps = []
best_reward = -float('inf')

# Initialize frame buffers for each environment
frame_buffers = [deque(maxlen=4) for _ in range(N_ENVS)]
current_states = np.zeros((N_ENVS, 4, RESIZE_HEIGHT, RESIZE_WIDTH))

def reset_env(i):
    """Reset a single environment"""
    obs, _ = vec_env.reset(index=[i])
    frame = preprocess(obs[0])
    frame_buffers[i].clear()
    for _ in range(4):
        frame_buffers[i].append(frame)
    return np.stack(frame_buffers[i], axis=0)

# Initialize all environments
print("Initializing parallel environments...")
obs, _ = vec_env.reset()
for i in range(N_ENVS):
    frame = preprocess(obs[i])
    for _ in range(4):
        frame_buffers[i].append(frame)
    current_states[i] = np.stack(frame_buffers[i], axis=0)

# Pre-fill replay memory
print("Pre-filling replay memory...")
while len(replay_memory) < MEMORY_FILL_SIZE:
    # Random actions for all environments
    actions = np.array([vec_env.single_action_space.sample() for _ in range(N_ENVS)])

# Inside the training loop:
next_obs, rewards, dones, truncateds, infos = vec_env.step(actions)

# Process each environment
next_states = np.zeros_like(current_states)
for i in range(N_ENVS):
    next_frame = preprocess(next_obs[i])
    frame_buffers[i].append(next_frame)
    next_states[i] = np.stack(frame_buffers[i], axis=0)
    
    if REWARD_CLIP:
        rewards[i] = np.clip(rewards[i], -1, 1)
        
    replay_memory.push(
        current_states[i].copy(),
        actions[i],
        rewards[i],
        next_states[i].copy(),
        dones[i] or truncateds[i]
    )
    
    if dones[i] or truncateds[i]:
        next_states[i] = reset_env(i)
        # Record episode completion
        episode_rewards.append(episode_reward[i])
        episode_steps.append(steps)
        episode_reward[i] = 0


print(f"Replay memory filled with {len(replay_memory)} transitions")

In [None]:
# Reset environment and initialize frame stack
obs, info = env.reset()
frame = preprocess(obs)

# Visualize the first preprocessed frame
plt.imshow(frame, cmap='gray')
plt.title(f"Preprocessed Frame\nSize: {frame.shape} | Crop: T:{CROP_TOP} B:{CROP_BOTTOM} L:{CROP_LEFT} R:{CROP_RIGHT}")
plt.show()

frame_stack = deque(maxlen=4)
for _ in range(4):
    frame_stack.append(frame)
state = np.stack(frame_stack, axis=0)

# Pre-fill replay memory
print("Pre-filling replay memory...")
for _ in tqdm(range(MEMORY_FILL_SIZE), desc="Pre-filling replay memory"):
    action = env.action_space.sample()
    next_obs, reward, done, truncated, info = env.step(action)

    # Clip reward if enabled
    if REWARD_CLIP:
        reward = np.clip(reward, -1, 1)

    next_frame = preprocess(next_obs)
    frame_stack.append(next_frame)
    next_state = np.stack(frame_stack, axis=0)

    replay_memory.push(state, action, reward, next_state, done)
    state = next_state

    if done or truncated:
        obs, info = env.reset()
        frame = preprocess(obs)
        frame_stack.clear()
        for _ in range(4):
            frame_stack.append(frame)
        state = np.stack(frame_stack, axis=0)



In [None]:
# Training loop with parallel environments
print("Starting training with parallel environments...")
total_updates = 0
start_time = time.time()

for episode in tqdm(range(N_EPISODES), desc="Training episodes"):
    steps = 0
    episode_reward = np.zeros(N_ENVS)
    step_losses = []

    # Collect N_STEPS from all environments
    for _ in range(N_STEPS):
        # Get actions for all environments
        actions = agent.get_actions(current_states, online_net)

        # Step all environments
        next_obs, rewards, dones, truncateds, _ = vec_env.step(actions)

        # Process each environment
        next_states = np.zeros_like(current_states)
        for i in range(N_ENVS):
            # Preprocess frame
            next_frame = preprocess(next_obs[i])

            # Update frame buffer
            frame_buffers[i].append(next_frame)
            next_states[i] = np.stack(frame_buffers[i], axis=0)

            # Clip reward
            if REWARD_CLIP:
                rewards[i] = np.clip(rewards[i], -10, 10)

            # Update episode reward
            episode_reward[i] += rewards[i]

            # Store transition
            replay_memory.push(
                current_states[i].copy(),
                actions[i],
                rewards[i],
                next_states[i].copy(),
                dones[i] or truncateds[i]
            )

            # Reset if done
            if dones[i] or truncateds[i]:
                next_states[i] = reset_env(i)
                # Record completed episode
                episode_rewards.append(episode_reward[i])
                episode_steps.append(steps)
                episode_reward[i] = 0

        current_states = next_states
        steps += 1
        total_steps += N_ENVS

    # Perform multiple training updates
    for _ in range(N_UPDATES):
        if len(replay_memory) > MINIBATCH_SIZE:
            batch_data = replay_memory.sample(MINIBATCH_SIZE)
            states, actions, rewards, next_states, dones = batch_data

            # Convert to tensors on GPU
            states_tensor = torch.as_tensor(states, device=device, dtype=torch.float32) / 255.0
            next_states_tensor = torch.as_tensor(next_states, device=device, dtype=torch.float32) / 255.0
            actions_tensor = torch.as_tensor(actions, device=device, dtype=torch.long)
            rewards_tensor = torch.as_tensor(rewards, device=device, dtype=torch.float32)
            dones_tensor = torch.as_tensor(dones, device=device, dtype=torch.float32)

            # Compute Q-values
            current_q = online_net(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)

            # Compute target Q-values
            with torch.no_grad():
                next_q = target_net(next_states_tensor).max(1)[0]
                target_q = rewards_tensor + DISCOUNT_FACTOR * next_q * (1 - dones_tensor)

            # Compute loss
            loss = F.mse_loss(current_q, target_q)
            step_losses.append(loss.item())

            # Optimize
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            for param in online_net.parameters():
                if param.grad is not None:
                    param.grad.data.clamp_(-1, 1)

            optimizer.step()

            total_updates += 1

    # Update target network periodically
    if total_updates % (TARGET_UPDATE_FREQ // N_UPDATES) == 0:
        target_net.load_state_dict(online_net.state_dict())

    # Decay epsilon
    for _ in range(N_STEPS * N_ENVS):
        agent.decay_epsilon()

    # Record metrics
    if step_losses:
        avg_loss = sum(step_losses) / len(step_losses)
        episode_losses.append(avg_loss)
    else:
        episode_losses.append(0)

    # Save best model
    current_avg_reward = np.mean(episode_rewards[-10:]) if episode_rewards else 0
    if current_avg_reward > best_reward:
        best_reward = current_avg_reward
        torch.save(online_net.state_dict(), f"{MODEL_FILE}_best.pth")

    # Logging
    if episode % 10 == 0 and episode_rewards:
        avg_reward = np.mean(episode_rewards[-50:]) if len(episode_rewards) >= 50 else np.mean(episode_rewards)
        avg_steps = np.mean(episode_steps[-50:]) if len(episode_steps) >= 50 else np.mean(episode_steps)
        mem_usage = replay_memory.memory_usage_gb()

        # Calculate steps per second
        elapsed = time.time() - start_time
        steps_per_sec = total_steps / elapsed if elapsed > 0 else 0

        print(f"Episode {episode}: "
              f"Avg Reward={avg_reward:.2f}, "
              f"Avg Steps={avg_steps:.0f}, "
              f"Epsilon={agent.epsilon:.4f}, "
              f"Loss={episode_losses[-1]:.4f}, "
              f"Steps={total_steps}, "
              f"Speed={steps_per_sec:.1f} steps/s")

# Save final model
torch.save(online_net.state_dict(), f"{MODEL_FILE}_final.pth")
print("Training complete. Models saved.")


In [None]:
# Plot results
plt.figure(figsize=(18, 5))

# Plot rewards
plt.subplot(1, 3, 1)
plt.plot(episode_rewards)
plt.title("Episode Rewards")
plt.xlabel("Episode")
plt.ylabel("Total Reward")

# Plot loss
plt.subplot(1, 3, 2)
plt.plot(episode_losses)
plt.title("Training Loss")
plt.xlabel("Episode")
plt.ylabel("Average Loss")

# Plot steps
plt.subplot(1, 3, 3)
plt.plot(episode_steps)
plt.title("Episode Steps")
plt.xlabel("Episode")
plt.ylabel("Steps Taken")

plt.tight_layout()
plt.savefig("training_results.png")
plt.show()
