---
title: Using ConvNets with Small Datasets
---

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pantelis/aiml-common/blob/master/lectures/cnn/cnn-example-architectures/using_convnets_with_small_datasets.ipynb)

This notebook is a PyTorch adaptation of the canonical small-dataset convnet example from
[Deep Learning with Python](https://www.manning.com/books/deep-learning-with-python) (F. Chollet, Chapter 5).

We use the `pantelism/cats-vs-dogs` dataset hosted on Hugging Face (the same 4,000-image Kaggle
subset used in the original) and demonstrate:

1. **Baseline**: training a small convnet from scratch → clear overfitting with only 2,000 training samples
2. **Regularisation**: data augmentation + dropout → substantially lower validation loss and higher accuracy

The trained model is saved as `cats_and_dogs_small.pth` for use by the companion
visualisation notebook.


In [None]:
!pip install huggingface_hub scikit-learn seaborn --quiet


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns

# ── Config ──────────────────────────────────────────────────────────────────
# Input geometry, batching and optimiser settings.
IMG_SIZE = 150     # images are resized to IMG_SIZE × IMG_SIZE
BATCH_SIZE = 32
LR = 1e-4          # default learning rate of train_model
SEED = 42          # passed to torch.manual_seed before each model is built

# Epoch budgets for the two experiments.
EPOCHS_BASELINE = 20   # enough to show overfitting clearly
EPOCHS_AUG = 30        # enough to show regularisation benefit

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Device: {DEVICE}")


## Dataset

`pantelism/cats-vs-dogs` hosts the exact 4,000-image Kaggle subset used in the original
Chollet notebook as a single ZIP file (`dogs-vs-cats-subset.zip`).
The archive already contains pre-split `train/` (2,000), `validation/` (1,000), and
`test/` (1,000) folders, each with `cats/` and `dogs/` sub-directories.

We download it via `hf_hub_download`, extract it once under `/tmp/cats-vs-dogs`, and load
each split directly with `torchvision.datasets.ImageFolder` — no manual splitting required.


In [None]:
import os, zipfile
from huggingface_hub import hf_hub_download
from torchvision.datasets import ImageFolder

# ── Download ZIP from Hugging Face Hub ───────────────────────────────────────
# zip_path is the local file that is handed to zipfile below.
zip_path = hf_hub_download(
    repo_id="pantelism/cats-vs-dogs",
    filename="dogs-vs-cats-subset.zip",
    repo_type="dataset",
)
print(f"ZIP path: {zip_path}")

# ── Extract once (ZIP root is subset/) ───────────────────────────────────────
extract_dir = "/tmp/cats-vs-dogs"
# The marker file is written only after extractall() has returned, so an
# interrupted extraction (which leaves extract_dir behind, half-populated)
# is redone on the next run instead of being mistaken for a complete dataset.
extract_marker = os.path.join(extract_dir, ".extracted")
if not os.path.exists(extract_marker):
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(extract_dir)
    with open(extract_marker, "w"):
        pass   # empty file; only its existence matters
    print("Extracted dataset")
else:
    print("Dataset already extracted")

# ZIP extracts to: extract_dir/subset/train|validation|test/cats|dogs/
base_dir  = os.path.join(extract_dir, "subset")
train_dir = os.path.join(base_dir, "train")
val_dir   = os.path.join(base_dir, "validation")
test_dir  = os.path.join(base_dir, "test")
# Quick sanity check: number of files in each training class folder.
print(f"Train cats: {len(os.listdir(os.path.join(train_dir,'cats')))}, "
      f"dogs: {len(os.listdir(os.path.join(train_dir,'dogs')))}")

# ── Transforms ───────────────────────────────────────────────────────────────
# Both pipelines start with the same resize and end with the same tensor
# conversion + Normalize (0.5 / 0.5 per channel); they differ only in the
# random augmentation steps placed in between.
_resize_steps = [transforms.Resize((IMG_SIZE, IMG_SIZE))]
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
]
_augment_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(40),
    transforms.RandomAffine(degrees=0, translate=(0.2, 0.2), shear=20),
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
]

# Deterministic pipeline (also used for validation and test data).
basic_tf = transforms.Compose(_resize_steps + _tensor_steps)

# Randomised pipeline used only for the augmented training run.
aug_tf = transforms.Compose(_resize_steps + _augment_steps + _tensor_steps)

# ── DataLoaders ──────────────────────────────────────────────────────────────
# The training folder is wrapped twice: once per transform pipeline.
train_ds_basic, train_ds_aug = (
    ImageFolder(train_dir, transform=tf) for tf in (basic_tf, aug_tf)
)
# Validation and test images always go through the deterministic pipeline.
val_ds, test_ds = (
    ImageFolder(split_dir, transform=basic_tf) for split_dir in (val_dir, test_dir)
)

label_names = train_ds_basic.classes   # ['cats', 'dogs']

def make_loader(ds, shuffle=False, num_workers=2):
    """Wrap *ds* in a ``DataLoader`` producing ``BATCH_SIZE``-sized batches.

    Args:
        ds: Dataset to iterate over.
        shuffle: Forwarded to ``DataLoader``; ``False`` by default.
        num_workers: Number of loader worker processes (default 2, the value
            that was previously hard-coded).

    Returns:
        The configured ``DataLoader``.
    """
    # Pinned host memory only speeds up host→GPU copies. Requesting it on a
    # CPU-only machine has no benefit and makes PyTorch emit a warning, so it
    # is enabled only when CUDA is actually available.
    return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=shuffle,
                      num_workers=num_workers,
                      pin_memory=torch.cuda.is_available())

# Only the two training loaders shuffle; validation and test use the default.
train_loader_basic, train_loader_aug = (
    make_loader(ds, shuffle=True) for ds in (train_ds_basic, train_ds_aug)
)
val_loader, test_loader = (make_loader(ds) for ds in (val_ds, test_ds))

print(f"Train {len(train_ds_basic)} | Val {len(val_ds)} | Test {len(test_ds)}")
print("Classes:", label_names)

# Pull a single batch to confirm tensor shape and label encoding.
imgs, labels = next(iter(train_loader_basic))
print(f"Batch shape: {imgs.shape}, Labels: {labels[:8].tolist()}")


## Model architecture

We replicate the Chollet convnet — four `Conv2d → ReLU → MaxPool2d` blocks that
progressively increase depth (32 → 64 → 128 → 128) while halving spatial dimensions
(150 → 74 → 36 → 17 → 7), followed by a fully-connected head.

An optional `Dropout(0.5)` layer is inserted before the first dense layer for the
regularised variant.

```
Input 3×150×150
  Conv2d(3→32, k=3)  → ReLU → MaxPool2d(2)   →  32×74×74
  Conv2d(32→64, k=3) → ReLU → MaxPool2d(2)   →  64×36×36
  Conv2d(64→128,k=3) → ReLU → MaxPool2d(2)   → 128×17×17
  Conv2d(128→128,k=3)→ ReLU → MaxPool2d(2)   → 128×7×7
  Flatten → [Dropout(0.5)] → Linear(6272→512) → ReLU → Linear(512→1)
```


In [None]:
class SmallConvNet(nn.Module):
    """Small convnet for binary image classification.

    ``features`` stacks four ``Conv2d(k=3) → ReLU → MaxPool2d(2)`` blocks with
    32, 64, 128 and 128 output channels.  ``classifier`` is
    ``[Dropout(0.5)] → Linear(128*7*7 → 512) → ReLU → Linear(512 → 1)``.
    The first linear layer's size means the input must yield a 128×7×7
    feature map, which a 3×150×150 image does.

    Args:
        dropout: When true, a ``Dropout(0.5)`` layer precedes the first linear
            layer; otherwise an ``Identity`` takes its slot, so the layer
            indices (and therefore the ``state_dict`` keys) are the same in
            both variants.
    """

    def __init__(self, dropout: bool = False):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3,   32,  3), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32,  64,  3), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64,  128, 3), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(128, 128, 3), nn.ReLU(), nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5) if dropout else nn.Identity(),
            nn.Linear(128 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one raw logit per sample, shape ``(B,)`` (no sigmoid applied)."""
        x = self.features(x)
        # flatten() copies when it has to, whereas view() raises on
        # non-contiguous activations (e.g. channels_last inputs).
        x = torch.flatten(x, start_dim=1)
        return self.classifier(x).squeeze(1)   # shape (B,)

# Verify output shape: a batch of two blank IMG_SIZE × IMG_SIZE images must
# come back as exactly two values.
_dummy = torch.zeros(2, 3, IMG_SIZE, IMG_SIZE)
_dummy_out = SmallConvNet()(_dummy)
assert _dummy_out.shape == (2,), "unexpected output shape"
print("Architecture verified — output shape (B,) ✓")

# Show the layer listing of a freshly built network.
print(SmallConvNet())


## Baseline: training from scratch with no regularisation

We train for 20 epochs with RMSprop and binary cross-entropy loss
(`BCEWithLogitsLoss`, applied to the model's raw logit output).
With only 2,000 training samples the network overfits quickly:
training accuracy climbs to ~95% while validation accuracy plateaus
around 70–72%, a textbook overfitting signature.


In [None]:
def _run_epoch(model, loader, criterion, optimiser=None):
    """Run one pass over *loader* and return ``(mean_loss, accuracy)``.

    With an optimiser the model is put in train mode and updated after every
    batch; without one it is put in eval mode and gradients are disabled.
    Both returned values are averaged over the number of samples seen.
    """
    training = optimiser is not None
    model.train(training)
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for imgs, labels in loader:
            imgs = imgs.to(DEVICE)
            # ImageFolder returns Long labels; BCEWithLogitsLoss needs Float.
            labels = labels.float().to(DEVICE)
            logits = model(imgs)
            loss = criterion(logits, labels)
            if training:
                optimiser.zero_grad()
                loss.backward()
                optimiser.step()
            batch_size = len(imgs)
            # criterion returns the batch mean, so weight it by the batch size
            # to obtain a correct per-sample average over the whole pass.
            loss_sum += loss.item() * batch_size
            # A positive logit is a prediction of class 1.
            n_correct += ((logits > 0) == labels.bool()).sum().item()
            n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen


def train_model(model, train_loader, val_loader, epochs, lr=LR):
    """Train *model* with RMSprop and ``BCEWithLogitsLoss``, validating each epoch.

    The model is moved to ``DEVICE``.  Progress is printed for the first
    epoch, every fifth epoch, and the final epoch.

    Args:
        model: Network returning one logit per sample.
        train_loader: Batches of ``(images, labels)`` used for the updates.
        val_loader: Batches of ``(images, labels)`` used for validation only.
        epochs: Number of passes over ``train_loader``.
        lr: RMSprop learning rate (defaults to the module-level ``LR``).

    Returns:
        dict with keys ``train_loss``, ``val_loss``, ``train_acc`` and
        ``val_acc``, each a list holding one value per epoch.
    """
    model = model.to(DEVICE)
    criterion = nn.BCEWithLogitsLoss()
    optimiser = torch.optim.RMSprop(model.parameters(), lr=lr)

    history = dict(train_loss=[], val_loss=[], train_acc=[], val_acc=[])

    for epoch in range(epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, optimiser)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion)

        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        # Also report the last epoch so the final metrics are always visible,
        # even when `epochs` is not a multiple of five.
        if epoch == 0 or (epoch + 1) % 5 == 0 or epoch + 1 == epochs:
            print(
                f"Epoch {epoch+1:3d}/{epochs}  "
                f"loss {history['train_loss'][-1]:.4f}  acc {history['train_acc'][-1]:.3f}  |  "
                f"val_loss {history['val_loss'][-1]:.4f}  val_acc {history['val_acc'][-1]:.3f}"
            )
    return history


def plot_history(history, title, save_path=None):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: Mapping with ``train_acc``, ``val_acc``, ``train_loss`` and
            ``val_loss`` lists (one value per epoch).
        title: Prefix for both panel titles.
        save_path: If given, the figure is written there before being shown.
    """
    xs = range(1, len(history["train_acc"]) + 1)
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

    # (axis, training key, validation key, training style, validation style, heading)
    panels = (
        (ax_acc,  "train_acc",  "val_acc",  "bo-", "b-", "Accuracy"),
        (ax_loss, "train_loss", "val_loss", "ro-", "r-", "Loss"),
    )
    for ax, train_key, val_key, train_fmt, val_fmt, heading in panels:
        ax.plot(xs, history[train_key], train_fmt, label="Training")
        ax.plot(xs, history[val_key], val_fmt, label="Validation")
        ax.set_title(f"{title} — {heading}")
        ax.set_xlabel("Epoch")
        ax.legend()

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=120, bbox_inches="tight")
    plt.show()


# Seed before building the network so its initial weights are reproducible.
torch.manual_seed(SEED)
model_baseline = SmallConvNet(dropout=False)
print("Training baseline …")
hist_baseline = train_model(
    model_baseline,
    train_loader=train_loader_basic,
    val_loader=val_loader,
    epochs=EPOCHS_BASELINE,
)
plot_history(
    hist_baseline,
    title="Baseline (no augmentation)",
    save_path="baseline_curves.png",
)


## Data augmentation + dropout

Data augmentation generates new views of each training image on-the-fly — random
horizontal flips, rotations, translations, shears, crop-resizes, and mild
brightness/contrast jitter — so the model never sees the exact same pixel pattern
twice.  Combined with `Dropout(0.5)`, this substantially reduces the
train-validation gap characteristic of overfitting.


In [None]:
# Same seed as the baseline run, applied before the network is built.
torch.manual_seed(SEED)
model_aug = SmallConvNet(dropout=True)
print("Training augmented model (data augmentation + dropout) …")
hist_aug = train_model(
    model_aug,
    train_loader=train_loader_aug,
    val_loader=val_loader,
    epochs=EPOCHS_AUG,
)
plot_history(
    hist_aug,
    title="Augmentation + Dropout",
    save_path="augmented_curves.png",
)


## Evaluation on the held-out test set

We evaluate the regularised model on the 1,000-image test split and report:

- **Confusion matrix** — to see which mistakes are made
- **ROC curve** — to characterise the trade-off across thresholds
- **Test accuracy** — headline metric

The model is saved as `cats_and_dogs_small.pth` for the companion
visualisation notebook.


In [None]:
# ── Save model ───────────────────────────────────────────────────────────────
# Store the weights as CPU tensors.  A checkpoint holding CUDA tensors cannot
# be loaded on a CPU-only machine unless the reader passes `map_location`;
# saving from the CPU keeps the file loadable everywhere.
_state = model_aug.state_dict()
for _name, _tensor in _state.items():
    _state[_name] = _tensor.cpu()   # rebinding values only; the model is untouched
torch.save(_state, "cats_and_dogs_small.pth")
print("Saved cats_and_dogs_small.pth")

# ── Collect predictions ──────────────────────────────────────────────────────
# Switch to eval mode and gather the sigmoid output for every test image
# together with its true label.
model_aug.eval()
all_labels, all_probs = [], []
with torch.no_grad():
    for imgs, labels in test_loader:
        probs = model_aug(imgs.to(DEVICE)).sigmoid().cpu().numpy()
        all_probs.extend(probs)
        all_labels.extend(labels.numpy())

all_probs  = np.asarray(all_probs)
all_labels = np.asarray(all_labels, dtype=int)
# Hard decisions at the 0.5 probability threshold.
preds      = (all_probs > 0.5).astype(int)

# ── Confusion matrix + ROC ───────────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_cm, ax_roc = axes

# Left panel: counts of true vs. predicted class.
cm = confusion_matrix(all_labels, preds)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    ax=ax_cm,
    cmap="Blues",
    xticklabels=label_names,
    yticklabels=label_names,
)
ax_cm.set_title("Confusion matrix (test set)")
ax_cm.set_ylabel("True label")
ax_cm.set_xlabel("Predicted label")

# Right panel: ROC curve built from the predicted probabilities, in percent.
fp, tp, _ = roc_curve(all_labels, all_probs)
ax_roc.plot(100 * fp, 100 * tp, linewidth=2)
ax_roc.set_xlabel("False positive rate [%]")
ax_roc.set_ylabel("True positive rate [%]")
ax_roc.set_title("ROC curve")
ax_roc.grid(True)

plt.tight_layout()
plt.savefig("evaluation.png", dpi=120, bbox_inches="tight")
plt.show()

# Headline metric: fraction of test images whose prediction matches the label.
acc = np.mean(preds == all_labels)
print(f"Test accuracy: {acc:.3f}")
