### Task 4 – MaxPool2d to Strided Convolution Ablation

In [1]:
!pip install --upgrade wandb

Collecting wandb
  Downloading wandb-0.24.0-py3-none-manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading wandb-0.24.0-py3-none-manylinux_2_28_x86_64.whl (22.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.8/22.8 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.22.2
    Uninstalling wandb-0.22.2:
      Successfully uninstalled wandb-0.22.2
Successfully installed wandb-0.24.0


In [2]:
import wandb
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

  2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjain5[0m ([33mjain5-university-of-potsdam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import sys
sys.path.append("/kaggle/input/src-cilp-assessment")

In [4]:
from src import models
import importlib
importlib.reload(models)

from src.models import (
    IntermediateFusionHadamardMaxPool,
    IntermediateFusionHadamardStrided,
)

In [5]:
import os

DATA_ROOT = "/kaggle/input/cilp-assessment-data/assessment"
print("DATA_ROOT exists:", os.path.exists(DATA_ROOT))
print("Cubes RGB:", len(os.listdir(os.path.join(DATA_ROOT, "cubes", "rgb"))))
print("Cubes LiDAR:", len(os.listdir(os.path.join(DATA_ROOT, "cubes", "lidar"))))
print("Spheres RGB:", len(os.listdir(os.path.join(DATA_ROOT, "spheres", "rgb"))))
print("Spheres LiDAR:", len(os.listdir(os.path.join(DATA_ROOT, "spheres", "lidar"))))

DATA_ROOT exists: True
Cubes RGB: 9999
Cubes LiDAR: 9999
Spheres RGB: 9999
Spheres LiDAR: 9999


In [6]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import numpy as np
from pathlib import Path

In [7]:
class SimpleCILPDataset(Dataset):
    def __init__(self, root, split="train", transform=None, seed=42):
        self.transform = transform
        self.samples = []

        rng = np.random.RandomState(seed)

        for label_name, label_id in [("cubes", 0), ("spheres", 1)]:
            rgb_dir = Path(root) / label_name / "rgb"
            lidar_dir = Path(root) / label_name / "lidar"

            rgb = {p.stem: p for p in rgb_dir.glob("*.png")}
            lidar = {p.stem: p for p in lidar_dir.glob("*.npy")}

            common = sorted(set(rgb) & set(lidar))
            rng.shuffle(common)

            split_idx = int(0.8 * len(common))
            selected = common[:split_idx] if split == "train" else common[split_idx:]

            for stem in selected:
                self.samples.append((
                    rgb[stem],
                    lidar[stem],
                    label_id
                ))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        rgb_path, lidar_path, label = self.samples[idx]

        rgb = Image.open(rgb_path).convert("RGB")
        if self.transform:
            rgb = self.transform(rgb)

        lidar = torch.tensor(np.load(lidar_path), dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)

        return rgb, lidar, label

In [8]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

train_dataset = SimpleCILPDataset(DATA_ROOT, split="train", transform=transform)
val_dataset   = SimpleCILPDataset(DATA_ROOT, split="val", transform=transform)

print("Train samples:", len(train_dataset))
print("Val samples:", len(val_dataset))

rgb, lidar, label = train_dataset[0]
print("RGB:", rgb.shape)
print("LiDAR:", lidar.shape)
print("Label:", label)

Train samples: 15998
Val samples: 4000
RGB: torch.Size([3, 128, 128])
LiDAR: torch.Size([64, 64])
Label: tensor(0)


In [9]:
lidar_input_dim = 64 * 64
print("LiDAR input dim:", lidar_input_dim)

LiDAR input dim: 4096


In [10]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla T4


In [11]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32 

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

In [12]:
rgb, lidar, label = next(iter(train_loader))
print(rgb.shape, lidar.shape, label.shape)
print("LiDAR input dim:", lidar_input_dim)

torch.Size([32, 3, 128, 128]) torch.Size([32, 64, 64]) torch.Size([32])
LiDAR input dim: 4096


In [17]:
def run_epoch(model, loader, criterion, optimizer=None, training=True):
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for rgb, lidar, labels in loader:
            rgb = rgb.to(device)
            lidar = lidar.to(device)
            labels = labels.to(device)

            outputs = model(rgb, lidar)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

In [18]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
import torch.nn as nn
import torch.optim as optim
import time
import wandb

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_variant(model_class, variant_name, lidar_input_dim,
                  train_loader, val_loader,
                  embedding_dim=128, num_classes=2,
                  epochs=10, lr=1e-3,
                  project="cilp-extended-assessment"):

    model = model_class(
        lidar_input_dim=lidar_input_dim,
        embedding_dim=embedding_dim,
        num_classes=num_classes,
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    num_params = count_parameters(model)

    wandb.init(
        project=project,
        name=f"task4-{variant_name}",
        config={
            "task": "task4_strided_ablation",
            "fusion_strategy": "intermediate_hadamard",
            "downsampling": variant_name,          # "maxpool" or "strided_conv"
            "model_architecture": model.__class__.__name__,
            "embedding_size": embedding_dim,
            "batch_size": train_loader.batch_size,
            "learning_rate": lr,
            "optimizer": optimizer.__class__.__name__,
            "epochs": epochs,
            "num_parameters": num_params,
            "dataset": "cilp-assessment",
        },
    )

    start_time = time.time()

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = run_epoch(model, train_loader, criterion, optimizer, training=True)
        val_loss, val_acc = run_epoch(model, val_loader, criterion, optimizer, training=False)

        current_lr = optimizer.param_groups[0]["lr"]

        print(
            f"[{variant_name}] Epoch {epoch} "
            f"Train Loss {train_loss:.4f}, Train Acc {train_acc:.4f} "
            f"Val Loss {val_loss:.4f}, Val Acc {val_acc:.4f}"
        )

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "learning_rate": current_lr,
        })

    total_training_time = time.time() - start_time
    time_per_epoch = total_training_time / epochs

    logs = {
        "total_training_time_sec": total_training_time,
        "time_per_epoch_sec": time_per_epoch,
    }

    if torch.cuda.is_available():
        max_mem_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)
        logs["max_gpu_memory_mb"] = max_mem_mb

    wandb.log(logs)
    wandb.finish()

    return {
        "variant": variant_name,
        "num_parameters": num_params,
        "total_training_time_sec": total_training_time,
        "time_per_epoch_sec": time_per_epoch,
        "max_gpu_memory_mb": logs.get("max_gpu_memory_mb", None),
        "final_train_loss": train_loss,
        "final_train_acc": train_acc,
        "final_val_loss": val_loss,
        "final_val_acc": val_acc,
    }


In [20]:
results = []

res_maxpool = train_variant(
    IntermediateFusionHadamardMaxPool,
    variant_name="maxpool",
    lidar_input_dim=lidar_input_dim,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    lr=1e-3,
)
results.append(res_maxpool)

res_strided = train_variant(
    IntermediateFusionHadamardStrided,
    variant_name="strided_conv",
    lidar_input_dim=lidar_input_dim,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    lr=1e-3,
)
results.append(res_strided)

results


[maxpool] Epoch 1 Train Loss 0.3873, Train Acc 0.8274 Val Loss 0.1080, Val Acc 0.9643
[maxpool] Epoch 2 Train Loss 0.0892, Train Acc 0.9692 Val Loss 0.0595, Val Acc 0.9812
[maxpool] Epoch 3 Train Loss 0.0348, Train Acc 0.9878 Val Loss 0.0340, Val Acc 0.9895
[maxpool] Epoch 4 Train Loss 0.0225, Train Acc 0.9927 Val Loss 0.0212, Val Acc 0.9928
[maxpool] Epoch 5 Train Loss 0.0212, Train Acc 0.9924 Val Loss 0.0141, Val Acc 0.9962
[maxpool] Epoch 6 Train Loss 0.0210, Train Acc 0.9937 Val Loss 0.0047, Val Acc 0.9990
[maxpool] Epoch 7 Train Loss 0.0282, Train Acc 0.9878 Val Loss 0.0534, Val Acc 0.9890
[maxpool] Epoch 8 Train Loss 0.0188, Train Acc 0.9939 Val Loss 0.0064, Val Acc 0.9980
[maxpool] Epoch 9 Train Loss 0.0139, Train Acc 0.9951 Val Loss 0.0186, Val Acc 0.9940
[maxpool] Epoch 10 Train Loss 0.0103, Train Acc 0.9970 Val Loss 0.0384, Val Acc 0.9882


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
max_gpu_memory_mb,▁
time_per_epoch_sec,▁
total_training_time_sec,▁
train_acc,▁▇████████
train_loss,█▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▇▇█▆█▇▆
val_loss,█▅▃▂▂▁▄▁▂▃

0,1
epoch,10.0
learning_rate,0.001
max_gpu_memory_mb,314.08105
time_per_epoch_sec,34.98906
total_training_time_sec,349.89065
train_acc,0.997
train_loss,0.01035
val_acc,0.98825
val_loss,0.03839


[strided_conv] Epoch 1 Train Loss 0.3907, Train Acc 0.8258 Val Loss 0.1019, Val Acc 0.9708
[strided_conv] Epoch 2 Train Loss 0.1095, Train Acc 0.9657 Val Loss 0.0752, Val Acc 0.9802
[strided_conv] Epoch 3 Train Loss 0.0339, Train Acc 0.9899 Val Loss 0.0184, Val Acc 0.9935
[strided_conv] Epoch 4 Train Loss 0.0170, Train Acc 0.9943 Val Loss 0.0529, Val Acc 0.9808
[strided_conv] Epoch 5 Train Loss 0.0142, Train Acc 0.9952 Val Loss 0.0151, Val Acc 0.9955
[strided_conv] Epoch 6 Train Loss 0.0112, Train Acc 0.9966 Val Loss 0.0179, Val Acc 0.9940
[strided_conv] Epoch 7 Train Loss 0.0116, Train Acc 0.9959 Val Loss 0.0176, Val Acc 0.9938
[strided_conv] Epoch 8 Train Loss 0.0072, Train Acc 0.9975 Val Loss 0.0059, Val Acc 0.9985
[strided_conv] Epoch 9 Train Loss 0.0086, Train Acc 0.9977 Val Loss 0.0166, Val Acc 0.9962
[strided_conv] Epoch 10 Train Loss 0.0070, Train Acc 0.9977 Val Loss 0.0186, Val Acc 0.9950


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
max_gpu_memory_mb,▁
time_per_epoch_sec,▁
total_training_time_sec,▁
train_acc,▁▇████████
train_loss,█▃▁▁▁▁▁▁▁▁
val_acc,▁▃▇▄▇▇▇█▇▇
val_loss,█▆▂▄▂▂▂▁▂▂

0,1
epoch,10.0
learning_rate,0.001
max_gpu_memory_mb,314.08105
time_per_epoch_sec,24.76186
total_training_time_sec,247.61857
train_acc,0.99775
train_loss,0.00704
val_acc,0.995
val_loss,0.01858


[{'variant': 'maxpool',
  'num_parameters': 1208258,
  'total_training_time_sec': 349.8906464576721,
  'time_per_epoch_sec': 34.989064645767215,
  'max_gpu_memory_mb': 314.0810546875,
  'final_train_loss': 0.01034719538028868,
  'final_train_acc': 0.9969996249531191,
  'final_val_loss': 0.03839003745937316,
  'final_val_acc': 0.98825},
 {'variant': 'strided_conv',
  'num_parameters': 1254434,
  'total_training_time_sec': 247.6185712814331,
  'time_per_epoch_sec': 24.761857128143312,
  'max_gpu_memory_mb': 314.0810546875,
  'final_train_loss': 0.007042125709639877,
  'final_train_acc': 0.9977497187148393,
  'final_val_loss': 0.01858437018224143,
  'final_val_acc': 0.995}]

## Task 4 – Strided Convolution Ablation

In this task, I compare two intermediate Hadamard fusion models that differ only in how the RGB encoder downsamples spatially: the baseline uses MaxPool2d, while the variant replaces each pooling operation with a stride‑2 convolution. Both models share the same LiDAR encoder, embedding dimension, fusion MLP, optimizer, learning rate and number of epochs, so differences in performance can be attributed to the downsampling strategy.

| Metric                 | MaxPool2d      | Strided Conv    | Difference (Strided − MaxPool) |
|------------------------|----------------|-----------------|--------------------------------|
| Parameters             | 1,208,258      | 1,254,434       | +46,176                        |
| Max GPU memory (MB)    | 314.08         | 314.08          | 0                              |
| Time per epoch (s)     | 34.99          | 24.76           | −10.23                         |
| Total train time (s)   | 349.89         | 247.62          | −102.27                        |
| Final train loss       | 0.01035        | 0.00704         | −0.00331                       |
| Final val loss         | 0.03839        | 0.01858         | −0.01981                       |
| Final train accuracy   | 0.99700        | 0.99775         | +0.00075                       |
| Final val accuracy     | 0.98825        | 0.99500         | +0.00675                       |

The results show that the strided‑convolution encoder is both more efficient and more accurate than the MaxPool2d baseline on this dataset. Although it uses slightly more parameters (+46k, about 3.8% increase), the strided model reduces time per epoch by roughly 10 seconds and total training time by more than 100 seconds, while keeping GPU memory usage identical. This suggests that replacing discrete pooling with learned stride‑2 convolutions can exploit GPU throughput better and avoid some of the information loss introduced by hard pooling operations.

From a generalization perspective, the strided model attains lower training and validation losses and improves validation accuracy from 98.8% to 99.5%, indicating a clear performance benefit rather than mere overfitting. Intuitively, the stride‑2 convolutions allow the network to learn how to downsample and preserve task‑relevant structure, instead of applying a fixed max operation that might discard useful shape cues. Given these findings, I would choose the strided‑convolution encoder as the preferred design for subsequent experiments, since it offers better accuracy and faster training with only a modest increase in parameter count.
