In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Render the value of every top-level expression statement in a cell,
# not only the last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

# Convolutional Neural Networks

## Goals

- Convolutions: Apply on images
- CNNs: Define, Optimize, Inspect, Understand

## Google Colab Check

In [None]:
import sys

# Detect Colab: this only checks whether the `google.colab` module has already
# been loaded into the running interpreter; nothing is imported here.
IN_COLAB = "google.colab" in sys.modules
print(f"In Colab: {IN_COLAB}")

# Show prominent message if in Colab
if IN_COLAB:
    try:
        # Imported only on this branch; any failure while importing or
        # rendering falls through to the plain-text message below.
        from IPython.display import Markdown, display

        display(
            Markdown(
                """
> 💾 **Optionally:**  
> Save this notebook to your **personal Google Drive** to persist any changes.
>
> *Go to `File ▸ Save a copy in Drive` before editing.*
            """
            )
        )
    except Exception:
        # Best-effort fallback: the hint is purely informational, so a broad
        # catch is used to make sure the cell never fails because of it.
        print(
            "\n💾 Optionally: Save the notebook to your personal Google Drive to persist changes.\n"
        )

We mount Google Drive to store data.

In [None]:
if IN_COLAB:
    # Guarded import: only attempted when the Colab check above succeeded.
    from google.colab import drive

    # Mount point is /content/drive; the Colab DATA_PATH used in this notebook
    # lives underneath it (/content/drive/MyDrive/...).
    drive.mount("/content/drive")

## Specify Data Path

**Modify the following paths if necessary.**

That is where your data will be stored.

In [None]:
from pathlib import Path

# Colab keeps the data on the mounted Google Drive; every other environment
# uses a folder relative to the notebook's working directory.
DATA_PATH = (
    Path("/content/drive/MyDrive/cas-dl-module-compvis-part1")
    if IN_COLAB
    else Path("../../data")
)
# Fail early, before any dataset code tries to use the folder.
assert DATA_PATH.exists(), f"PATH: {DATA_PATH} does not exist."

## Install Lectures Package

Install `dl_cv_lectures` package with all necessary dependencies.

This package provides the environment of the exercises-repository, as well as helper- and utils modules: [Link](https://github.com/marco-willi/cas-dl-compvis-exercises-hs2025)

The following code installs the package from a local repository (if available), otherwise it installs it from the exercise repository.

In [None]:
import subprocess
import sys
from pathlib import Path

from rich.console import Console

console = Console()


def ensure_dl_cv_lectures():
    """Ensure dl_cv_lectures is installed (local or from GitHub).

    If the package can already be imported, nothing is installed. Otherwise
    pip is run with the current interpreter: an editable install of
    ``/workspace`` when ``/workspace/pyproject.toml`` exists, else an install
    from the GitHub exercise repository.

    A failing pip run is reported on the console and not re-raised. After a
    successful run the import system's finder caches are invalidated so that
    the package installed during this session can be found by later imports.

    Returns:
        None. Progress and outcome are only printed via ``console``.
    """
    # Local import: only needed on the (rare) installation path.
    import importlib

    try:
        # Used purely as an availability probe; the name itself is not needed.
        import dl_cv_lectures  # noqa: F401

        console.print(
            "[bold green]✅ dl_cv_lectures installed — all good![/bold green]"
        )
        return
    except ImportError:
        console.print("[bold yellow]⚠️ dl_cv_lectures not found.[/bold yellow]")

    repo_path = Path("/workspace/pyproject.toml")
    if repo_path.exists():
        console.print("[cyan]📦 Installing from local repository...[/cyan]")
        cmd = [sys.executable, "-m", "pip", "install", "-e", "/workspace"]
    else:
        console.print("[cyan]🌐 Installing from GitHub repository...[/cyan]")
        cmd = [
            sys.executable,
            "-m",
            "pip",
            "install",
            "git+https://github.com/marco-willi/cas-dl-compvis-exercises-hs2025",
        ]

    try:
        # `sys.executable -m pip` targets the interpreter running this kernel.
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        console.print(f"[bold red]❌ Installation failed ({e}).[/bold red]")
        return

    # Directory listings may have been cached before the install happened;
    # without this, an import right after the install can still fail.
    importlib.invalidate_caches()
    console.print("[bold green]✅ Installation successful![/bold green]")


ensure_dl_cv_lectures()

### Load Libraries

Load all libraries and packages used in this exercise.

In [None]:
import io
from collections.abc import Callable

import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import torchshow as ts
import torchvision
from matplotlib import pyplot as plt
from PIL import Image
from torchvision.transforms.v2 import functional as TF
from tqdm.notebook import tqdm

from dl_cv_lectures.transform import RandomQuadrantPad

Define a default device for your computations.

In [None]:
# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using: {device}")

## 1) Convolutions in PyTorch

We apply a _convolution_ on images.

Let's get an image.

In [None]:
url = "https://github.com/pytorch/vision/blob/main/gallery/assets/dog2.jpg?raw=true"
# Without a timeout a stalled connection would block the cell indefinitely.
r = requests.get(url, allow_redirects=True, timeout=30)
# Surface HTTP errors here instead of handing an error page to PIL, which
# would only report that it cannot identify the image file.
r.raise_for_status()
image = Image.open(io.BytesIO(r.content))

image

Now we define a kernel / filter by hand. Let's take a look at it.

In [None]:
# Hand-defined kernel: every row of every input channel (R, G, B) is
# [1, 0, -1]. Repeating that row yields the weight layout
# (out_channels=1, in_channels=3, height=3, width=3) as float32.
conv_kernel = torch.tensor([1.0, 0.0, -1.0]).repeat(1, 3, 3, 1)


ts.show(conv_kernel, show_axis=False)

Now we apply the kernel to the image. We need to convert the image to a `torch.tensor` first.

In [None]:
# PIL image -> float32 tensor, divided by 255 (maps 8-bit pixel values to [0, 1]).
image_tensor = TF.pil_to_tensor(image).to(torch.float32).div(255.0)

activations = F.conv2d(
    input=image_tensor,
    weight=conv_kernel,
    stride=1,
    padding=0,
    dilation=1,
    groups=1,
)

# Min-max rescale the activations so they can be visualized as an image.
act_min = activations.min()
act_range = activations.max() - act_min
activations_scaled = (activations - act_min) / act_range

ts.show(activations_scaled)

We can compare the activations to the input image. What happened?

In [None]:
fig, axes = plt.subplots(figsize=(18, 6), ncols=3)

crop_size = (128, 128)
# One entry per panel, left to right: (image to draw, keep the colorbar visible?)
panels = [
    # full activation map
    (TF.to_pil_image(activations_scaled), True),
    # activation map - center crop
    (TF.to_pil_image(TF.center_crop(activations_scaled, output_size=crop_size)), True),
    # original image - center crop
    (
        TF.to_pil_image(TF.center_crop(TF.pil_to_tensor(image), output_size=crop_size)),
        False,
    ),
]

for ax, (img, show_cbar) in zip(axes, panels):
    img_ax = ax.imshow(img)
    ax.axis("off")
    cbar = fig.colorbar(
        img_ax, ax=ax, orientation="horizontal", fraction=0.046, pad=0.04
    )
    if show_cbar:
        cbar.set_label("Activation value", fontsize=10)
    else:
        # The colorbar is still created but hidden; presumably this keeps the
        # panel the same size as its neighbours.
        cbar.ax.set_visible(False)

plt.tight_layout()

**Question**: What did the convolutional kernel do / what image features lead to high activations?

<details>
<summary>Click to reveal answer</summary>

A simple way to interpret a filter is to visually inspect it.

Our filter is $3 \times 3$ in size with columns of identical values, from left to right:

- high value

- zero value

- negative value


Our image values are scaled from the original range 0 to 255 down to the range 0 to 1. The output activation is highest when the input pixels are bright (high) under the first column and dark (low) under the third column; the pixels under the second column are irrelevant. This means that strong vertical edges going from bright (left) to dark (right) produce strong activations.

</details>

There are two approaches to apply operations in PyTorch:

- *Functional*: These are *stateless* functions [nn.functional](https://pytorch.org/docs/stable/nn.functional.html)
- *Modules*: Using *stateful* objects which are used in neural networks [torch.nn.Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html)

Now we perform a *convolution* using a *module*. [torch.nn.Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html)

In [None]:
# module: the layer object owns its weight and bias, created at construction
conv_module = torch.nn.Conv2d(
    in_channels=3,
    out_channels=1,
    kernel_size=3,
    stride=1,
    padding=0,
    dilation=1,
    groups=1,
)

# The result is only visualized, never back-propagated through, so the
# forward pass runs without recording an autograd graph. This also gives a
# plain tensor (requires_grad=False) for plotting.
with torch.no_grad():
    activations = conv_module(image_tensor)

# rescale result to visualize it as an image
activations = (activations - activations.min()) / (
    activations.max() - activations.min()
)


ts.show(activations)

**Question**: What is the difference between the *functional* and *module* approach? What happens in the *module* approach?

<details>
<summary>Click to reveal answer</summary>

**Functional approach (`torch.nn.functional`):**
- Functions that perform operations **statelessly**
- No parameters are stored/maintained
- You need to pass parameters (like filters/weights) explicitly each time
- Example: `F.conv2d(input, weight, ...)`
- More flexible but requires manual parameter management

**Module approach (`torch.nn.Module`):**
- Classes that **encapsulate both parameters and operations**
- Parameters (weights, biases) are automatically registered and managed
- Gradients are tracked automatically during backpropagation
- Example: `nn.Conv2d(...)` creates a layer with learnable parameters
- Parameters accessible via `.parameters()` method
- Better for building neural networks with trainable weights

**What happens in the module approach:**
1. When you instantiate a module (e.g., `nn.Conv2d(1, 16, 5)`), it **initializes learnable parameters** (filters/weights and optionally biases)
2. These parameters are registered as part of the module's state
3. During forward pass, the module applies the operation using its stored parameters
4. During training, optimizer can access and update these parameters via `.parameters()`

**Use functional when:** You want to apply operations with custom/fixed parameters
**Use module when:** Building neural networks with learnable parameters (most common case)

</details>

Now apply the following operations to the image using the *functional* approach:

- Convolution
- Max Pooling
- Convolution

You can use the filter from above, if possible.

In [None]:
# Convolution -> max pooling -> convolution, all through stateless functional calls.
x = F.conv2d(
    input=image_tensor, weight=conv_kernel, stride=1, padding=0, dilation=1, groups=1
)
x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))

# The first convolution produced a single channel, so only the first
# input-channel slice of the kernel is used for the second convolution.
single_channel_kernel = conv_kernel[:, :1]
activations = F.conv2d(
    input=x, weight=single_channel_kernel, stride=1, padding=0, dilation=1, groups=1
)

# Min-max rescale for visualization.
act_min = activations.min()
activations = (activations - act_min) / (activations.max() - act_min)
ts.show(activations)

## 2) CNN Properties

In the following we will conduct a few experiments to understand how CNNs work and to contrast them with MLPs.

### Data

We create a modified MNIST dataset.

In [None]:
def _quadrant_mnist(train: bool, quadrant: str) -> torchvision.datasets.MNIST:
    """Build an MNIST split transformed by ``RandomQuadrantPad(choices=[quadrant])``.

    Args:
        train: Passed through as the ``train`` flag of the MNIST dataset.
        quadrant: The single quadrant name handed to ``RandomQuadrantPad``.

    Returns:
        The dataset, stored under ``DATA_PATH`` (downloaded if necessary).
    """
    return torchvision.datasets.MNIST(
        root=DATA_PATH,
        train=train,
        download=True,
        transform=RandomQuadrantPad(choices=[quadrant]),
    )


# Training split: top-left quadrant only
ds_mnist_train = _quadrant_mnist(train=True, quadrant="top_left")

# Test split with the same quadrant as in training
ds_mnist_test_tl = _quadrant_mnist(train=False, quadrant="top_left")

# Test split with a quadrant that is not used for the training split
ds_mnist_test_br = _quadrant_mnist(train=False, quadrant="bottom_right")

Let's visualize a few data points.

In [None]:
# Small shuffled loader, used here only to preview a handful of samples.
dl_mnist_train = torch.utils.data.DataLoader(
    dataset=ds_mnist_train,
    batch_size=12,
    shuffle=True,
    num_workers=4,
)

# Pull exactly one batch and show its images.
batch_iterator = iter(dl_mnist_train)
images, labels = next(batch_iterator)

ts.show(images)

**Question**: What do you notice?

<details>
<summary>Click to reveal answer</summary>

- **Digits are positioned in the top-left quadrant**
- **Large black borders**: Most of the image is black/zero-padding with the actual digit occupying only 1/4 of the space
- **Image size increased**: Original 28×28 MNIST images are now embedded in 56×56 images (2× in each dimension)

</details>

### Architecture Definition

We define a CNN with the following architecture:

- Input Shape: (1, 28 * 2, 28 * 2)
- Convolution1: 8 Filters, Kernel-Size 5x5
- Max Pooling: Stride 2, Kernel-Size 2
- Convolution2: 16 Filters, Kernel-Size 5x5
- Max Pooling: Stride 2, Kernel-Size 2
- FC1: 32 neurons
- FC2: 16 neurons
- FC3: 10 neurons (because we have 10 classes)

We use ReLU after each layer and subclass `torch.nn.Module`.

**Task:** Implement the CNN as defined above in the next cell.

<details>
<summary>Click to reveal answer</summary>

```python
class Net(nn.Module):
    def __init__(self, num_channel=8, num_classes=10):
        super().__init__()

        # --- Feature extractor ---
        self.features = nn.Sequential(
            # Conv block 1
            nn.Conv2d(1, num_channel, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv block 2
            nn.Conv2d(num_channel, num_channel * 2, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # --- Classifier ---
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # 11 * 11 is the output - we can either specify explicitly 
            # or use nn.LazyLinear which infers the number of input
            # features at runtime
            #nn.Linear(num_channel * 2 * 11 * 11, num_channel * 4),
            nn.LazyLinear(num_channel * 4)
            nn.ReLU(),
            nn.Linear(num_channel * 4, num_channel * 2),
            nn.ReLU(),
            nn.Linear(num_channel * 2, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
```
</details>

In [None]:
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN: two conv/ReLU/max-pool blocks followed by a three-layer MLP head.

    Args:
        num_channel: Number of filters of the first convolution; the widths of
            the second convolution and of the hidden linear layers are
            multiples of it.
        num_classes: Number of output logits.
    """

    def __init__(self, num_channel=8, num_classes=10):
        super().__init__()

        conv1_width = num_channel
        conv2_width = num_channel * 2
        fc1_width = num_channel * 4
        fc2_width = num_channel * 2

        # --- Feature extractor: (conv 5x5 -> ReLU -> max-pool 2x2) twice ---
        feature_layers = [
            nn.Conv2d(1, conv1_width, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(conv1_width, conv2_width, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*feature_layers)

        # --- Classifier ---
        # LazyLinear infers its input width on the first forward pass. For a
        # 56x56 input the feature map is 11x11, i.e. the explicit equivalent
        # would be nn.Linear(num_channel * 2 * 11 * 11, num_channel * 4).
        classifier_layers = [
            nn.Flatten(),
            nn.LazyLinear(fc1_width),
            nn.ReLU(),
            nn.Linear(fc1_width, fc2_width),
            nn.ReLU(),
            nn.Linear(fc2_width, num_classes),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        return self.classifier(self.features(x))


net = Net()

print(net)
# NOTE(review): torchinfo.summary presumably runs a forward pass with a dummy
# input of shape (1, 1, 56, 56); that pass would also fix the input width of
# the LazyLinear layer before the optimizer is created further down — confirm.
print(torchinfo.summary(net, input_size=(1, 1, 56, 56)))

**Question**: Explain why layer FC1 is defined with `nn.Linear(num_channel*2 * 11 * 11, num_channel*4)`.

<details>
<summary>Click to reveal answer</summary>

**Explanation of `nn.Linear(num_channel*2 * 11 * 11, num_channel*4)`:**

This linear layer connects the flattened convolutional output to the first fully-connected layer. Let's break down why it's `11 × 11`:

1. **Input image**: 56×56
2. **After conv1 (5×5, no padding)**: (56 - 5 + 1) = 52×52
3. **After pool1 (2×2, stride=2)**: 52/2 = 26×26
4. **After conv2 (5×5, no padding)**: (26 - 5 + 1) = 22×22
5. **After pool2 (2×2, stride=2)**: 22/2 = **11×11**

With `num_channel*2` output channels from conv2, the flattened size is: `num_channel*2 × 11 × 11`
</details>


**Question**: Examine the number of parameters per layer. What do you notice?

<details>
<summary>Click to reveal answer</summary>



**Parameters per layer observations:**

Looking at the torchinfo summary, you should notice:
- **Convolutional layers have relatively few parameters**: A 5×5 conv with C_in=1, C_out=8 has only 8×(5×5 + 1) = 208 params
- **First fully-connected layer has MANY parameters**: `(num_channel*2 * 11 * 11) * (num_channel*4)` weights plus `num_channel*4` biases is huge (e.g., 61,984 params with `num_channel=8`)
- **FC layers dominate the parameter count**
- **This is why modern CNNs minimize FC layers**: Using techniques like Global Average Pooling instead

**Key insight:** The spatial dimensions (11×11) multiply the parameter count dramatically in FC layers, which is a major inefficiency compared to convolutional layers that share weights spatially.

</details>

### Training

We define loss function and optimizer. Since we are modelling a classification problem we use the _cross-entropy loss_. The AdamW-Optimizer is a good default.

In [None]:
import torch.optim as optim

# Multi-class classification -> cross-entropy loss.
loss_fn = nn.CrossEntropyLoss()
# AdamW over all model parameters, with the library's default hyperparameters.
optimizer = optim.AdamW(params=net.parameters())

Let's define the training-loop.

In [None]:
def train_one_epoch(
    data_loader: torch.utils.data.DataLoader,
    net: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_fn: Callable,
    device: str = "cpu",
    verbose: bool = True,
):
    """Train ``net`` for one pass over ``data_loader``.

    Args:
        data_loader: Iterable yielding ``(X, y)`` batches.
        net: Model to optimize; it is moved to ``device`` and put in training mode.
        optimizer: Optimizer that updates the parameters of ``net``.
        loss_fn: Callable invoked as ``loss_fn(logits, y)``; its result must
            support ``.backward()``.
        device: Target device for the model and the batches.
        verbose: If True, show a progress bar with the batch loss, the batch
            accuracy and the running epoch accuracy.

    Returns:
        None. Metrics are only reported through the progress bar.
    """
    net = net.to(device)
    # An earlier evaluation may have left the model in eval mode; switch back
    # so that mode-dependent layers behave correctly while training.
    net.train()

    total_samples_seen = 0
    total_correct = 0

    with tqdm(data_loader, unit="batch", disable=not verbose) as tepoch:
        for X, y in tepoch:
            X = X.to(device)
            y = y.to(device)

            # Update Step
            logits = net(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate Accuracy. Softmax is monotonic, so the argmax of the
            # logits is the predicted class; the result keeps shape (batch,)
            # even for a batch holding a single sample.
            y_hat = logits.detach().argmax(dim=1)

            num_correct = (y_hat == y).sum().item()
            num_samples = X.shape[0]
            batch_accuracy = num_correct / num_samples

            # Epoch Statistics
            total_samples_seen += num_samples
            total_correct += num_correct
            epoch_accuracy = total_correct / total_samples_seen

            if verbose:
                tepoch.set_postfix(
                    loss=loss.item(),
                    accuracy_batch=batch_accuracy,
                    accuracy_epoch=epoch_accuracy,
                )

Now we train our model.

In [None]:
# Larger batches than in the preview loader, for the actual training.
dl_mnist_train = torch.utils.data.DataLoader(
    dataset=ds_mnist_train,
    batch_size=256,
    shuffle=True,
    num_workers=4,
)

total_epochs = 5
for epoch in range(total_epochs):
    print(f"Starting Epoch: {epoch + 1} / {total_epochs}")
    train_one_epoch(
        data_loader=dl_mnist_train,
        net=net,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

We evaluate our model on test data. Let's define the test dataset and look at a few samples.


In [None]:
# Test loader for the top-left split; no shuffling, so the order is fixed.
dl_mnist_test_tl = torch.utils.data.DataLoader(
    dataset=ds_mnist_test_tl,
    batch_size=32,
    shuffle=False,
)

# Preview the first batch.
batch_iterator = iter(dl_mnist_test_tl)
images, labels = next(batch_iterator)

ts.show(images)

**Question**: Do you think the model will perform well?

<details>
<summary>Click to reveal answer</summary>

The model should perform well on this test set.

**Reasoning:**
- **Training data**: The model was trained with `RandomQuadrantPad()` using a fixed quadrant (top-left).
- **Test data**: The `ds_mnist_test_tl` dataset has digits also only in the top-left quadrant.
- **Training covered this case**: Since training included digits in the top-left quadrant, the model has seen this configuration before

**Expected outcome:**
- **High accuracy** (likely >95%), the model should recognize digits in the top-left corner
</details>

In [None]:
def eval_loop(
    data_loader: torch.utils.data.DataLoader,
    net: torch.nn.Module,
    loss_fn: Callable,
    device: str = "cpu",
) -> tuple[float, torch.Tensor, torch.Tensor]:
    """Evaluate ``net`` on every batch of ``data_loader``.

    Args:
        data_loader: Iterable yielding ``(X, y)`` batches.
        net: Model to evaluate; it is moved to ``device`` and left in eval mode.
        loss_fn: Callable invoked as ``loss_fn(logits, y)``; only used for the
            progress bar.
        device: Target device for the model and the batches.

    Returns:
        A tuple ``(accuracy, y, y_hat)``: the accuracy over all samples, the
        concatenated targets and the concatenated predicted class indices
        (both on the CPU).

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    net = net.to(device)
    net.eval()

    total_samples_seen = 0
    total_correct = 0
    y_list = []
    y_hat_list = []

    with tqdm(data_loader, unit="batch") as tepoch:
        for X, y in tepoch:
            # Forward Pass; neither the logits nor the loss need gradients here.
            with torch.no_grad():
                logits = net(X.to(device))
                loss = loss_fn(logits, y.to(device))

            # Predictions. Softmax is monotonic, so the argmax of the logits
            # is the predicted class. The result keeps shape (batch,) even for
            # a single-sample batch, which the final concatenation requires.
            y_hat = logits.argmax(dim=1).cpu()
            y = y.cpu()

            # Metrics
            num_correct = (y_hat == y).sum().item()
            total_samples_seen += X.shape[0]
            total_correct += num_correct

            tepoch.set_postfix(
                loss=loss.item(),
                accuracy_epoch=total_correct / total_samples_seen,
            )

            # save preds and targets
            y_list.append(y)
            y_hat_list.append(y_hat)

    if total_samples_seen == 0:
        raise ValueError("data_loader did not yield any samples to evaluate.")

    epoch_accuracy = total_correct / total_samples_seen
    return epoch_accuracy, torch.concat(y_list), torch.concat(y_hat_list)

In [None]:
# Evaluate on the top-left test split.
test_accuracy, y, y_hat = eval_loop(
    data_loader=dl_mnist_test_tl,
    net=net,
    loss_fn=loss_fn,
    device=device,
)

print(f"Test Accuracy:  {test_accuracy:.3f}")

Now, we use the following test data.

In [None]:
# Test loader for the bottom-right split; no shuffling, so the order is fixed.
dl_mnist_test_br = torch.utils.data.DataLoader(
    dataset=ds_mnist_test_br,
    batch_size=32,
    shuffle=False,
)

# Preview the first batch.
batch_iterator = iter(dl_mnist_test_br)
images, labels = next(batch_iterator)

ts.show(images)

**Question**: How well do you think the model will work in this case?

<details>
<summary>Click to reveal answer</summary>

**One possible hypothesis could be:**

- Test data has digits only in the *bottom-right quadrant*.
- The CNN was trained with digits only in the top-left quadrant, however, the same convolutional filters that detect edges, curves, and digit features in the top-left also detect them in the bottom-right.

**Expected results:** It might work! **Similar accuracy to the top-left test set** (likely >95%)

</details>

Validate your hypothesis.

In [None]:
# Evaluate on the bottom-right test split.
test_accuracy, y, y_hat = eval_loop(
    data_loader=dl_mnist_test_br,
    net=net,
    loss_fn=loss_fn,
    device=device,
)

print(f"Test Accuracy:  {test_accuracy:.3f}")

**Question**: What happened?

<details>
<summary>Click to reveal answer</summary>

**Oooopsy!!**: That does not look very good.


**The issue is the fully connected layers**: The fully connected layers are not translation equivariant and thus likely have weights near zero for each neuron connected to quadrants in which no digits were seen during training.

</details>

**Task**: Try to improve the model by making architectural changes.


<details>
<summary>Click to reveal answer</summary>

```python
class Net(nn.Module):
    def __init__(self, num_channel=8, num_classes=10):
        super().__init__()

        # --- Feature extractor ---
        self.features = nn.Sequential(
            # Conv block 1
            nn.Conv2d(1, num_channel, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv block 2
            nn.Conv2d(num_channel, num_channel * 2, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # --- Classifier ---
        
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(output_size=(1)),
            nn.Flatten(),
            nn.Linear(num_channel * 2, num_channel),
            nn.ReLU(),
            nn.Linear(num_channel, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
```
</details>

**Question**: How easy is it?

<details>
<summary>Click to reveal answer</summary>

It is possible that boundary effects affect model performance: digits in the top-left corner might not lead to the same activations as those in the bottom-right corner. In particular, max-pooling is not perfectly invariant to translations, so you might want to replace the max-pooling layers. Another option is to use data augmentation, e.g. randomly padding the border during model training.
</details>