# Image classification with a small CNN (cats vs dogs)

Adapted from Chapter 8 of François Chollet's *Deep Learning with Python*.

This version **does not rely on Kaggle**. Instead, it uses a public Google-hosted
mirror of the filtered cats vs dogs dataset:
https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip

It also creates a proper **train / validation / test** split by moving a subset of
images into a new `test/` directory:

- 200 cats + 200 dogs from `train/`
- 100 cats + 100 dogs from `validation/`

You can run this notebook directly in Colab. A GPU is recommended but not required.

In [None]:
import os
import pathlib
import zipfile
import random
import shutil
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Report the installed TensorFlow version so results can be tied to a release.
_tf_version = tf.__version__
print("TensorFlow version:", _tf_version)


## Download and extract the cats vs dogs dataset (Google mirror)

We download the zip file to Keras' cache directory and then **explicitly extract** it.
After extraction, we robustly locate the `cats_and_dogs_filtered` directory.

In [None]:
from pathlib import Path

# Public Google-hosted mirror of the filtered cats-vs-dogs archive.
URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip"

# Download the zip file (without auto-extraction); we unpack it ourselves below.
zip_path = tf.keras.utils.get_file(
    "cats_and_dogs_filtered.zip",
    origin=URL,
    extract=False,
)
zip_path = Path(zip_path)
download_root = zip_path.parent
print("Zip path:", zip_path)
print("Download root:", download_root)

# Explicitly extract the zip under download_root
with zipfile.ZipFile(zip_path, "r") as zf:
    zf.extractall(download_root)

# Locate the extracted dataset root. Only directories that really contain both
# `train/` and `validation/` qualify, and the shallowest (then alphabetically
# first) match wins, so the choice never depends on directory traversal order.
candidates = sorted(
    (
        p
        for p in download_root.rglob("cats_and_dogs_filtered")
        if p.is_dir() and (p / "train").is_dir() and (p / "validation").is_dir()
    ),
    key=lambda p: (len(p.parts), str(p)),
)
if not candidates:
    raise RuntimeError(f"Could not find 'cats_and_dogs_filtered' under {download_root}")

base_dir = candidates[0]
print("Using base_dir:", base_dir)

# The extraction above has just restored every image into train/ and
# validation/. A test/ directory left over from an earlier run would therefore
# hold copies of images that are now back in the training data (test leakage),
# so remove it; the split step further down creates test/ again.
stale_test_dir = base_dir / "test"
if stale_test_dir.is_dir():
    print("Removing stale test split:", stale_test_dir)
    shutil.rmtree(stale_test_dir)

train_dir = base_dir / "train"
validation_dir = base_dir / "validation"

train_cats_dir = train_dir / "cats"
train_dogs_dir = train_dir / "dogs"
validation_cats_dir = validation_dir / "cats"
validation_dogs_dir = validation_dir / "dogs"

# Count images before any splitting
num_train_cats = len(list(train_cats_dir.glob("*.jpg")))
num_train_dogs = len(list(train_dogs_dir.glob("*.jpg")))
num_val_cats = len(list(validation_cats_dir.glob("*.jpg")))
num_val_dogs = len(list(validation_dogs_dir.glob("*.jpg")))

print("BEFORE SPLIT:")
print("Training cats:", num_train_cats)
print("Training dogs:", num_train_dogs)
print("Validation cats:", num_val_cats)
print("Validation dogs:", num_val_dogs)

# Debugging aid: if any class directory came up empty, show what was actually
# extracted. Stop after 40 entries instead of listing the whole tree.
if 0 in (num_train_cats, num_train_dogs, num_val_cats, num_val_dogs):
    print("\nFirst 40 paths under base_dir for debugging:")
    for shown, p in enumerate(base_dir.rglob("*")):
        if shown >= 40:
            break
        print("  ", p)


## Create a proper test split

We create a new `test/` directory and **move** images into it so the test data is
never seen during training:

- 200 cats + 200 dogs from `train/`
- 100 cats + 100 dogs from `validation/`

In [None]:
# Fix the RNG seed so the random selection of test images is seeded.
random.seed(1337)

# Paths of the new held-out split: test/cats and test/dogs under base_dir.
test_dir = base_dir / "test"
test_cats_dir = test_dir / "cats"
test_dogs_dir = test_dir / "dogs"

# parents=True also creates test/ itself, so only the class folders are listed.
for class_dir in (test_cats_dir, test_dogs_dir):
    class_dir.mkdir(parents=True, exist_ok=True)

def move_subset(src_dir, dst_dir, n):
    """Move a random sample of up to ``n`` ``*.jpg`` files into ``dst_dir``.

    Candidate files are sorted by path before sampling. Directory listing
    order is filesystem-dependent, so without the sort a fixed
    ``random.seed`` would not guarantee the same selection on every run.

    Args:
        src_dir: Directory to take images from (``str`` or path-like).
        dst_dir: Directory to move images into (``str`` or path-like).
            Created, including parents, if it does not exist yet.
        n: Number of images to move. If fewer are available, a warning is
            printed and all of them are moved.

    Returns:
        A list with the destination path of every moved file.

    Raises:
        ValueError: If ``n`` is negative (raised by ``random.sample``).
    """
    src_dir = pathlib.Path(src_dir)
    dst_dir = pathlib.Path(dst_dir)
    dst_dir.mkdir(parents=True, exist_ok=True)

    # Sort so the sample depends only on the seed and the file names.
    files = sorted(src_dir.glob("*.jpg"))
    if len(files) < n:
        print(f"Warning: requested {n} files from {src_dir}, but only {len(files)} available. Using all.")
        n = len(files)

    moved = []
    for p in random.sample(files, n):
        target = dst_dir / p.name
        shutil.move(str(p), str(target))
        moved.append(target)
    return moved

# Carve out the test split: 200 per class from train/, then 100 per class
# from validation/. The order is kept fixed because every call draws from the
# same seeded random stream.
for source_dir, target_dir, how_many in (
    (train_cats_dir, test_cats_dir, 200),
    (train_dogs_dir, test_dogs_dir, 200),
    (validation_cats_dir, test_cats_dir, 100),
    (validation_dogs_dir, test_dogs_dir, 100),
):
    move_subset(source_dir, target_dir, how_many)

# Count the .jpg files left in (or moved into) each class directory.
num_train_cats = sum(1 for _ in train_cats_dir.glob("*.jpg"))
num_train_dogs = sum(1 for _ in train_dogs_dir.glob("*.jpg"))
num_val_cats = sum(1 for _ in validation_cats_dir.glob("*.jpg"))
num_val_dogs = sum(1 for _ in validation_dogs_dir.glob("*.jpg"))
num_test_cats = sum(1 for _ in test_cats_dir.glob("*.jpg"))
num_test_dogs = sum(1 for _ in test_dogs_dir.glob("*.jpg"))

print("AFTER SPLIT:")
for split_label, split_count in (
    ("Training cats:", num_train_cats),
    ("Training dogs:", num_train_dogs),
    ("Validation cats:", num_val_cats),
    ("Validation dogs:", num_val_dogs),
    ("Test cats:", num_test_cats),
    ("Test dogs:", num_test_dogs),
):
    print(split_label, split_count)


## Visualize a few training images

Quick sanity check: we display the first few images from each class.

In [None]:
import matplotlib.image as mpimg
import numpy as np

# Lay out a 3x3 grid and fill it with cats first, then dogs.
nrows, ncols = 3, 3
fig, axes = plt.subplots(nrows, ncols, figsize=(8, 8))
axes = axes.flatten()

# Half of the cells (rounded down) show cats; dogs take the remaining cells.
n_cells = nrows * ncols
n_cat_cells = n_cells // 2
cat_images = list(train_cats_dir.glob("*.jpg"))[:n_cat_cells]
dog_images = list(train_dogs_dir.glob("*.jpg"))[: n_cells - len(cat_images)]
all_images = cat_images + dog_images

for ax, img_path in zip(axes, all_images):
    ax.imshow(mpimg.imread(str(img_path)))
    ax.axis("off")
    # The class label is read off the file name.
    if "cat" in img_path.name:
        ax.set_title("cat")
    else:
        ax.set_title("dog")

plt.tight_layout()
plt.show()


## Set up data generators (train / validation / test)

We use `ImageDataGenerator` to rescale pixel values and stream images from disk.

In [None]:
# Input resolution fed to the network and the number of images per batch.
img_height = 150
img_width = 150
batch_size = 32

# One generator per split; each only rescales pixel values by 1/255.
train_datagen = ImageDataGenerator(rescale=1.0 / 255)
val_datagen = ImageDataGenerator(rescale=1.0 / 255)
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Options shared by all three directory iterators.
_flow_options = dict(
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode="binary",
)

train_generator = train_datagen.flow_from_directory(train_dir, **_flow_options)

validation_generator = val_datagen.flow_from_directory(validation_dir, **_flow_options)

# shuffle=False keeps test batches in a fixed order, which matters if
# predictions are later inspected in order.
test_generator = test_datagen.flow_from_directory(
    test_dir,
    shuffle=False,
    **_flow_options,
)


## Build a small convolutional neural network (Functional API)

This is the same architecture as a simple `Sequential` convnet, but written in
Keras' **Functional API** style, which makes the computation graph explicit.

In [None]:
# Network input: RGB images of size img_height x img_width.
inputs = keras.Input(shape=(img_height, img_width, 3))

# Four conv -> max-pool stages with 32, 64, 128 and 128 filters.
x = inputs
for n_filters in (32, 64, 128, 128):
    x = layers.Conv2D(n_filters, (3, 3), activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)

# Classifier head: flatten, one hidden dense layer, one sigmoid output unit.
x = layers.Flatten()(x)
x = layers.Dense(512, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()


## Train the model

We use binary crossentropy loss because this is a two-class problem with a single sigmoid output unit.

In [None]:
# Number of passes over the training data.
epochs = 20

# RMSprop with a learning rate of 1e-4; accuracy is tracked alongside the loss.
_optimizer = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(
    optimizer=_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train on the training generator and evaluate on the validation generator
# after every epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=epochs,
)


## Plot training and validation curves

This helps you diagnose overfitting and underfitting.

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]
loss = history.history["loss"]
val_loss = history.history["val_loss"]

epochs_range = range(len(acc))

plt.figure(figsize=(10, 4))

# Left panel: accuracy on the training and validation data.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label="Training acc")
plt.plot(epochs_range, val_acc, label="Validation acc")
plt.legend()
plt.title("Training and validation accuracy")

# Right panel: loss on the training and validation data.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label="Training loss")
# `label` must be passed as a keyword argument; calling it as a function
# raised NameError because no such name is defined.
plt.plot(epochs_range, val_loss, label="Validation loss")
plt.legend()
plt.title("Training and validation loss")

plt.tight_layout()
plt.show()


## Evaluate on the held-out test set

Finally, we evaluate the trained model on the `test/` directory that was never
used during training.

In [None]:
# Score the trained model on the held-out test generator; evaluate() yields
# the loss followed by the compiled metric (accuracy).
_test_metrics = model.evaluate(test_generator)
test_loss, test_acc = _test_metrics
print("Test accuracy:", test_acc)
print("Test loss:", test_loss)
