# Tutorial 12: Convolutional Neural Networks

Implementing convolutions from scratch and understanding what CNNs learn.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
# Seed NumPy's and PyTorch's global random number generators so the random
# draws and weight initialisations in the cells below are repeatable.
np.random.seed(42)
torch.manual_seed(42)

## Part 1: Convolution from Scratch

In [None]:
def conv2d_naive(image, kernel, stride=1, padding=0):
    """
    2D convolution (naive implementation for understanding).

    As in deep-learning libraries, the kernel is NOT flipped, so strictly
    speaking this is a cross-correlation: each output value is the sum of
    the element-wise product of the kernel with one image patch.

    Args:
        image: Input image, array-like of shape (H, W)
        kernel: Convolution kernel, array-like of shape (KH, KW); it does
            not have to be square
        stride: Step size (positive integer), applied to both axes
        padding: Zero-padding size added to every side of the image

    Returns:
        Output feature map as a float array of shape
        ((H + 2*padding - KH) // stride + 1,
         (W + 2*padding - KW) // stride + 1)

    Raises:
        ValueError: If stride is smaller than 1, or if image or kernel is
            not two-dimensional.
    """
    if stride < 1:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    image = np.asarray(image)
    kernel = np.asarray(kernel)
    if image.ndim != 2 or kernel.ndim != 2:
        raise ValueError("image and kernel must both be 2-dimensional")

    # Pad image
    if padding > 0:
        image = np.pad(image, padding, mode='constant', constant_values=0)

    H, W = image.shape
    # Track height and width separately: slicing a square patch for a
    # rectangular kernel would silently broadcast and give wrong results.
    KH, KW = kernel.shape

    # Output size
    H_out = (H - KH) // stride + 1
    W_out = (W - KW) // stride + 1

    output = np.zeros((H_out, W_out))

    for i in range(H_out):
        top = i * stride
        for j in range(W_out):
            left = j * stride
            # Extract the patch currently under the kernel
            patch = image[top:top + KH, left:left + KW]
            # Convolve (element-wise multiply and sum)
            output[i, j] = np.sum(patch * kernel)

    return output

# Test with edge detection kernels
# Create a simple image with edges
image = np.zeros((10, 10))
image[2:8, 2:8] = 1  # White square in center

# Common edge detection kernels
kernels = {
    'Horizontal Edge': np.array([[-1, -1, -1],
                                  [ 0,  0,  0],
                                  [ 1,  1,  1]]),
    'Vertical Edge': np.array([[-1, 0, 1],
                                [-1, 0, 1],
                                [-1, 0, 1]]),
    'Sobel X': np.array([[-1, 0, 1],
                         [-2, 0, 2],
                         [-1, 0, 1]]),
    'Laplacian': np.array([[ 0, -1,  0],
                           [-1,  4, -1],
                           [ 0, -1,  0]])
}

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
panels = axes.flatten()

# Original image goes in the first panel
panels[0].imshow(image, cmap='gray')
panels[0].set_title('Original Image')
panels[0].axis('off')

# Apply kernels, one panel per kernel; padding=1 keeps the 10x10 size
for ax, (name, kernel) in zip(panels[1:], kernels.items()):
    output = conv2d_naive(image, kernel, padding=1)
    ax.imshow(output, cmap='gray')
    ax.set_title(f'{name}')
    ax.axis('off')

# The grid has more panels than images (1 original + one per kernel);
# hide the leftovers so no empty axes frame is drawn.
for unused_panel in panels[1 + len(kernels):]:
    unused_panel.axis('off')

plt.tight_layout()
plt.show()

print("Each kernel detects different features (edges, corners, etc.)")

## Part 2: Verify Against PyTorch

In [None]:
# Compare our implementation with PyTorch
image_np = np.random.randn(5, 5)
kernel_np = np.random.randn(3, 3)

# Our implementation
our_output = conv2d_naive(image_np, kernel_np, padding=1)

# PyTorch: F.conv2d expects a batch and channel dimension on both operands
image_torch = torch.tensor(image_np).float().unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
kernel_torch = torch.tensor(kernel_np).float().unsqueeze(0).unsqueeze(0)  # (1, 1, K, K)
pytorch_output = F.conv2d(image_torch, kernel_torch, padding=1).squeeze().numpy()

print("Our output shape:", our_output.shape)
print("PyTorch output shape:", pytorch_output.shape)
max_diff = np.abs(our_output - pytorch_output).max()
print(f"Max difference: {max_diff:.2e}")

# Only claim success when the results actually agree. The PyTorch path was
# cast to float32 via .float() while the NumPy path stays in float64, so
# compare with a float32-level tolerance rather than exact equality.
if np.allclose(our_output, pytorch_output, atol=1e-5):
    print("✓ Implementations match!")
else:
    print("✗ Implementations differ!")

## Part 3: Receptive Field Visualization

In [None]:
def compute_receptive_field(layers):
    """
    Return the receptive field size of a stack of conv/pool layers.

    layers: list of (kernel_size, stride) tuples, ordered from the input
    towards the output. An empty list gives 1 (a single pixel).
    """
    field_size = 1  # a single input pixel before any layer is applied
    jump = 1  # product of the strides of all layers processed so far

    for kernel_size, stride in layers:
        # Each layer widens the field by (kernel_size - 1) steps, where one
        # step spans `jump` input pixels because of the earlier strides.
        field_size += (kernel_size - 1) * jump
        jump = jump * stride

    return field_size

# Compare different architectures; each entry is a list of
# (kernel_size, stride) tuples in input-to-output order
architectures = {
    'One 7x7': [(7, 1)],
    'Three 3x3': [(3, 1)] * 3,
    'Two 5x5': [(5, 1)] * 2,
    # two blocks of (conv, conv, 2x2 pool)
    'VGG-style (3x3s + pool)': [(3, 1), (3, 1), (2, 2)] * 2,
}

print("Receptive Field Comparison:")
print("=" * 50)
for name, layers in architectures.items():
    # Only stride-1 entries are counted as weight-carrying conv layers;
    # the stride-2 entries are the pooling steps and are ignored here.
    conv_kernel_sizes = [k for k, s in layers if s == 1]
    params = sum(k * k for k in conv_kernel_sizes)
    rf = compute_receptive_field(layers)
    print(f"{name:30s}: RF = {rf:3d}, params ∝ {params}")

print("\nInsight: Three 3x3 convs have same RF as one 7x7, but fewer parameters!")
print("7x7 = 49 params, 3×(3×3) = 27 params")

## Part 4: Build and Train a CNN

In [None]:
class SimpleCNN(nn.Module):
    """
    Small CNN: three conv + ReLU + max-pool blocks followed by two linear layers.

    Expects input of shape (batch, 1, 28, 28) and returns unnormalised class
    scores (logits) of shape (batch, 10). The first linear layer is sized for
    the 64 x 3 x 3 feature map that a 28x28 input produces after three 2x2
    poolings, so other spatial sizes are rejected with a shape error.
    """

    def __init__(self):
        super().__init__()
        # Conv layers (3x3 kernels with padding=1 keep the spatial size)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        # Pooling (2x2 window, stride 2: halves height and width)
        self.pool = nn.MaxPool2d(2, 2)

        # Fully connected
        self.fc1 = nn.Linear(64 * 3 * 3, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) logits."""
        # Conv blocks
        x = self.pool(F.relu(self.conv1(x)))  # 28 -> 14
        x = self.pool(F.relu(self.conv2(x)))  # 14 -> 7
        x = self.pool(F.relu(self.conv3(x)))  # 7 -> 3

        # Flatten and FC. Flatten everything except the batch dimension:
        # reshaping with view(-1, 64 * 3 * 3) could silently regroup elements
        # into a different number of rows when the input size is wrong,
        # whereas this form makes fc1 raise a shape error instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Load MNIST (downloaded into ./data on first use); ToTensor converts each
# image to a float tensor
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000)

# Model, optimiser and loss
model = SimpleCNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Per-epoch history: mean training loss and test-set accuracy
train_losses = []
test_accs = []

for epoch in range(5):
    # Training pass: one optimiser step per mini-batch
    model.train()
    running_loss = 0
    for images, targets in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Evaluation pass: count correct top-1 predictions on the test set
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for images, targets in test_loader:
            predictions = model(images).argmax(1)
            hits += (predictions == targets).sum().item()
            seen += len(targets)

    # Loss is averaged over the number of batches, accuracy over samples
    train_losses.append(running_loss / len(train_loader))
    test_accs.append(hits / seen)
    print(f"Epoch {epoch+1}: Loss={train_losses[-1]:.4f}, Test Acc={test_accs[-1]:.2%}")

## Part 5: Visualize Learned Filters

In [None]:
# Visualize first layer filters: one 3x3 kernel per output channel
filters = model.conv1.weight.detach().numpy()
num_filters = filters.shape[0]

fig, axes = plt.subplots(4, 4, figsize=(8, 8))
for idx, panel in enumerate(axes.flatten()):
    # Draw a filter while there are filters left; every panel, used or
    # not, ends up with its axes hidden.
    if idx < num_filters:
        panel.imshow(filters[idx, 0], cmap='gray')
    panel.axis('off')

plt.suptitle('Learned Conv1 Filters (3x3)', fontsize=14)
plt.tight_layout()
plt.show()

print("First layer learns edge-like and blob-like patterns")

In [None]:
# Visualize feature maps for a sample
sample_image, label = test_dataset[0]

# Get activations by replaying the conv blocks by hand, keeping each
# post-ReLU map before it is pooled
model.eval()
with torch.no_grad():
    x = sample_image.unsqueeze(0)  # add a batch dimension
    act1 = F.relu(model.conv1(x))
    pooled1 = model.pool(act1)
    act2 = F.relu(model.conv2(pooled1))
    pooled2 = model.pool(act2)
    act3 = F.relu(model.conv3(pooled2))

fig, axes = plt.subplots(3, 6, figsize=(15, 8))

# Row 0: the input image in the first panel, remaining panels blank
input_panel, *blank_panels = axes[0]
input_panel.imshow(sample_image.squeeze(), cmap='gray')
input_panel.set_title(f'Input (label={label})')
input_panel.axis('off')
for blank in blank_panels:
    blank.axis('off')

# Row 1: first five Conv1 channels, last panel blank
for i, panel in enumerate(axes[1, :5]):
    panel.imshow(act1[0, i].numpy(), cmap='viridis')
    panel.set_title(f'Conv1 ch{i}')
    panel.axis('off')
axes[1, 5].axis('off')

# Row 2: first five Conv2 channels, last panel blank
for i, panel in enumerate(axes[2, :5]):
    panel.imshow(act2[0, i].numpy(), cmap='viridis')
    panel.set_title(f'Conv2 ch{i}')
    panel.axis('off')
axes[2, 5].axis('off')

plt.suptitle('Feature Maps Through the Network', fontsize=14)
plt.tight_layout()
plt.show()

print("Deeper layers have smaller spatial size but detect more complex patterns")

## Summary

**Key insights:**
1. **Convolution** = sliding dot product, detects local patterns
2. **Weight sharing** = same filter everywhere = translation equivariance (shifting the input shifts the feature map by the same amount)
3. **Pooling** = downsampling + approximate invariance to small shifts
4. **Hierarchy** = edges → textures → parts → objects
5. **Receptive field** grows with depth