# EuroSAT CNN Architecture Experiments

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import time

# Apply the "seaborn-v0_8-darkgrid" matplotlib style sheet to every figure created below.
plt.style.use("seaborn-v0_8-darkgrid")
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")

%matplotlib inline

## Load dataset

In [None]:
def load_eurosat_dataset(data_dir="data"):
    """Load an image-folder dataset in which each sub-directory is one class.

    Every immediate sub-directory of ``data_dir`` is treated as a class. The
    class names are sorted alphabetically and the position of a name in that
    sorted list is the integer label of its images. Inside each class
    directory all ``*.jpg`` and ``*.png`` files are read in sorted order, so
    the sample order (and therefore any seeded split made later) does not
    depend on the order in which the file system lists the files.

    Files that cannot be read are reported on stdout and skipped.

    Args:
        data_dir: Path to the dataset root directory.

    Returns:
        Tuple ``(data, labels, class_names)``: the stacked image array, the
        integer label array aligned with it, and the sorted class names.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    data_path = Path(data_dir)

    # Only directories are classes; stray files in the root are ignored.
    class_names = sorted(d.name for d in data_path.iterdir() if d.is_dir())

    print(f"Found {len(class_names)} classes: {class_names}")

    images = []
    labels = []

    for class_idx, class_name in enumerate(class_names):
        class_path = data_path / class_name
        # glob() gives no ordering guarantee; sort for run-to-run reproducibility.
        image_files = sorted(class_path.glob("*.jpg")) + sorted(
            class_path.glob("*.png")
        )

        print(f"Loading {len(image_files)} images from {class_name}...")

        for img_path in image_files:
            try:
                # The context manager closes the underlying file handle as
                # soon as the pixels have been copied into the array.
                with Image.open(img_path) as img:
                    # Force 3-channel RGB so all arrays stack to one shape,
                    # matching the (H, W, 3) input the models below expect.
                    img_array = np.array(img.convert("RGB"))
            except Exception as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"Error loading {img_path}: {e}")
            else:
                images.append(img_array)
                labels.append(class_idx)

    # Convert to numpy arrays
    data = np.array(images)
    labels = np.array(labels)

    print("\nDataset loaded successfully!")
    print(f"Total images           : {len(data)}")
    print(f"Data shape             : {data.shape}")
    print(f"Labels shape           : {labels.shape}")

    return data, labels, class_names


# Load the dataset from the local "data" directory.
# Runs when the cell executes; `data`, `labels` and `class_names` are used by
# the cells that follow.
data, labels, class_names = load_eurosat_dataset("data")

## Data preprocessing

In [None]:
# Scale the pixel values by 1/255 (maps 8-bit intensities into [0, 1]) and
# keep the integer class labels unchanged.
X = data.astype(np.float32) / 255.0
y = labels

print(
    f"Data shape             : {X.shape}",
    f"Data dtype             : {X.dtype}",
    f"Data range             : [{X.min():.2f}, {X.max():.2f}]",
    f"Number of classes      : {len(class_names)}",
    sep="\n",
)

## Data split

In [None]:
# First cut: hold out 20% of all samples as the test set (stratified by class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

# Second cut: 13% of the remaining 80% (about 10.4% of all samples) becomes
# the validation set, again stratified by class.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, random_state=42, test_size=0.13
)

print(
    f"Training set size      : {X_train.shape[0]} samples",
    f"Validation set size    : {X_val.shape[0]} samples",
    f"Test set size          : {X_test.shape[0]} samples",
    f"Image shape            : {X_train.shape[1:]}",
    f"Number of classes      : {len(class_names)}",
    sep="\n",
)

## Helper functions

In [None]:
class SparseF1Score(keras.metrics.Metric):
    """F1 score metric that accepts sparse (integer) class labels.

    Wraps ``keras.metrics.F1Score`` by one-hot encoding the integer labels
    to ``num_classes`` columns before delegating to it.

    Args:
        num_classes: Number of classes (depth of the one-hot encoding).
        average: Averaging mode forwarded to ``keras.metrics.F1Score``.
        name: Metric name; training logs use it as ``<name>`` and
            ``val_<name>``.
        **kwargs: Forwarded to ``keras.metrics.Metric``.
    """

    def __init__(self, num_classes, average="weighted", name="f1_score", **kwargs):
        super().__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.average = average
        self.f1_metric = keras.metrics.F1Score(average=average)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch of integer labels and class scores."""
        # Labels can arrive as (batch,) or (batch, 1). Flatten first so the
        # one-hot result is always (batch, num_classes) and lines up with
        # y_pred instead of gaining an extra axis.
        flat_labels = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        y_true_one_hot = tf.one_hot(flat_labels, self.num_classes)
        self.f1_metric.update_state(y_true_one_hot, y_pred, sample_weight)

    def result(self):
        """Return the F1 score accumulated so far."""
        return self.f1_metric.result()

    def reset_state(self):
        """Clear the accumulated statistics of the wrapped metric."""
        self.f1_metric.reset_state()

    def get_config(self):
        """Return the constructor arguments so the metric can be re-created."""
        config = super().get_config()
        config.update({"num_classes": self.num_classes, "average": self.average})
        return config


class F1Callback(keras.callbacks.Callback):
    """Callback that remembers the best validation F1 score seen so far.

    It only records the maximum of the ``val_f1_score`` log entry in
    ``best_f1``; it does not stop training itself.
    """

    def __init__(self):
        super().__init__()
        self.best_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        """Update ``best_f1`` from this epoch's ``val_f1_score``, if present."""
        # `logs` defaults to None, so guard before calling .get on it.
        val_f1 = (logs or {}).get("val_f1_score")
        # Compare against None explicitly: a score of 0.0 is a valid value.
        if val_f1 is not None and val_f1 > self.best_f1:
            self.best_f1 = val_f1

In [None]:
def evaluate_model(
    model,
    X_train,
    X_val,
    y_train,
    y_val,
    model_name,
    epochs=20,
    batch_size=32,
    target_names=None,
):
    """Train ``model``, score it on the validation set and print a report.

    Training uses early stopping on the ``val_f1_score`` log entry
    (patience 5, best weights restored), so the model must report a metric
    under that name. After training, the validation set is predicted and
    weighted precision/recall/F1 plus accuracy are computed with scikit-learn.

    Args:
        model: Compiled Keras model to train.
        X_train: Training inputs.
        X_val: Validation inputs.
        y_train: Integer training labels.
        y_val: Integer validation labels.
        model_name: Label used in printed output and in the returned dict.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        target_names: Class names for the classification report. When
            ``None`` the notebook-level ``class_names`` is used, which keeps
            existing calls working unchanged.

    Returns:
        Dict with the trained ``model``, ``model_name``, the Keras
        ``history``, predicted labels ``y_pred``, the metrics ``accuracy``,
        ``precision``, ``recall`` and ``f1``, and the durations
        ``train_time`` and ``predict_time`` in seconds.
    """
    # Resolve the report labels up front so a missing global fails before
    # training rather than after it.
    report_names = class_names if target_names is None else target_names

    print(f"\nTraining {model_name}...")

    # Early stopping on validation F1 score
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_f1_score",
        patience=5,
        restore_best_weights=True,
        mode="max",
        verbose=1,
    )

    # Train. perf_counter is monotonic, so the measured duration is not
    # affected by system clock adjustments.
    start_time = time.perf_counter()
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=1,
    )
    train_time = time.perf_counter() - start_time

    # Predict on validation set (the timing includes the argmax step).
    start_time = time.perf_counter()
    y_pred_probs = model.predict(X_val, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)
    predict_time = time.perf_counter() - start_time

    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average="weighted")
    recall = recall_score(y_val, y_pred, average="weighted")
    f1 = f1_score(y_val, y_pred, average="weighted")

    # Print results
    print(f"\nTraining time          : {train_time:.2f} seconds")
    print(f"Prediction time        : {predict_time:.2f} seconds")

    print("\nValidation Set Performance Metrics:")
    print(f"  Accuracy             : {accuracy:.4f}")
    print(f"  Precision            : {precision:.4f}")
    print(f"  Recall               : {recall:.4f}")
    print(f"  F1-Score             : {f1:.4f}")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, target_names=report_names))

    return {
        "model": model,
        "model_name": model_name,
        "history": history,
        "y_pred": y_pred,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "train_time": train_time,
        "predict_time": predict_time,
    }

In [None]:
def plot_confusion_matrix(y_val, y_pred, class_names, model_name):
    """Draw the confusion matrix as an annotated heatmap and return it.

    Args:
        y_val: True integer labels.
        y_pred: Predicted integer labels.
        class_names: Tick labels for both axes.
        model_name: Name shown in the figure title.

    Returns:
        The confusion matrix computed by ``sklearn.metrics.confusion_matrix``.
    """
    matrix = confusion_matrix(y_val, y_pred)
    axis_font = {"fontsize": 12, "fontweight": "bold"}
    heatmap_options = {
        "annot": True,
        "fmt": "d",
        "cmap": "Blues",
        "xticklabels": class_names,
        "yticklabels": class_names,
        "cbar_kws": {"label": "Number of samples"},
    }

    plt.figure(figsize=(12, 10))
    sns.heatmap(matrix, **heatmap_options)

    plt.xlabel("Predicted Label", **axis_font)
    plt.ylabel("True Label", **axis_font)
    plt.title(f"Confusion Matrix - {model_name}", fontsize=14, fontweight="bold")

    # Slant the column labels; keep the row labels horizontal.
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

    return matrix


def plot_training_curves(history, model_name):
    """Show train/validation loss and F1 curves side by side for one model.

    Args:
        history: Keras ``History`` object; its ``history`` dict must contain
            ``loss``, ``val_loss``, ``f1_score`` and ``val_f1_score``.
        model_name: Name shown in both panel titles.
    """
    logs = history.history
    bold12 = {"fontsize": 12, "fontweight": "bold"}

    # One entry per panel:
    # (train key, val key, train legend, val legend, y-axis label, title)
    panels = (
        ("loss", "val_loss", "Train Loss", "Val Loss", "Loss",
         f"Loss Curves - {model_name}"),
        ("f1_score", "val_f1_score", "Train F1", "Val F1", "F1 Score",
         f"F1 Score Curves - {model_name}"),
    )

    _, axes = plt.subplots(1, 2, figsize=(14, 5))

    for ax, panel in zip(axes, panels):
        train_key, val_key, train_legend, val_legend, y_label, title = panel
        ax.plot(logs[train_key], label=train_legend, linewidth=2)
        ax.plot(logs[val_key], label=val_legend, linewidth=2)
        ax.set_xlabel("Epoch", **bold12)
        ax.set_ylabel(y_label, **bold12)
        ax.set_title(title, fontsize=13, fontweight="bold")
        ax.legend()
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

## Experiment 1: Network Depth

In this experiment, we investigate the optimal depth of the CNN by varying the number of Conv2D + MaxPooling blocks. 

We test architectures with **2, 3, and 4** Conv2D + MaxPooling blocks. Each configuration has:
- **ELU activation** function
- **He initialization** for weights
- **Batch Normalization** before and after each convolutional layer
- **Early stopping** on validation F1 score

In [None]:
def build_depth_model(
    num_blocks, num_classes, input_shape=(64, 64, 3), base_filters=16
):
    """
    Assemble and compile a CNN whose depth is set by ``num_blocks``.

    Each block is BatchNorm -> Conv2D (3x3, ELU, He-normal init, same
    padding) -> BatchNorm -> 2x2 MaxPooling. The blocks are followed by
    Flatten, two Dense(128, ELU) layers each followed by BatchNorm, and a
    softmax output layer.

    Args:
        num_blocks: Number of Conv2D + MaxPooling blocks
        num_classes: Number of output classes
        input_shape: Shape of input images
        base_filters: Number of filters, identical in every block

    Returns:
        Keras model compiled with Adam, sparse categorical cross-entropy
        and a weighted SparseF1Score metric
    """
    net = keras.Sequential(name=f"depth_model_{num_blocks}_blocks")
    net.add(layers.Input(shape=input_shape))

    # Convolutional feature extractor
    for block_no in range(1, num_blocks + 1):
        prefix = f"block{block_no}"
        stage = (
            layers.BatchNormalization(name=f"{prefix}_bn_pre"),
            layers.Conv2D(
                filters=base_filters,
                kernel_size=(3, 3),
                activation="elu",
                padding="same",
                kernel_initializer="he_normal",
                name=f"{prefix}_conv",
            ),
            layers.BatchNormalization(name=f"{prefix}_bn_post"),
            layers.MaxPooling2D(pool_size=(2, 2), name=f"{prefix}_maxpool"),
        )
        for layer in stage:
            net.add(layer)

    # Classifier head
    net.add(layers.Flatten(name="flatten"))
    for dense_name in ("dense_1", "dense_2"):
        net.add(
            layers.Dense(
                units=128,
                activation="elu",
                kernel_initializer="he_normal",
                name=dense_name,
            )
        )
        net.add(layers.BatchNormalization(name=f"{dense_name}_bn"))
    net.add(layers.Dense(units=num_classes, activation="softmax", name="output"))

    f1_metric = SparseF1Score(num_classes=num_classes, average="weighted")
    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[f1_metric],
    )

    return net

In [None]:
# Experiment 1 sweep: train one model per depth (2, 3 and 4 blocks)
depth_configs = [2, 3, 4]
depth_results = []

for num_blocks in depth_configs:
    print(f"Testing CNN with {num_blocks} Conv2D + MaxPooling block(s)")

    # Fresh model for this depth
    model = build_depth_model(num_blocks, len(class_names))

    # Show the layer stack
    print("\nArchitecture Summary:")
    model.summary()

    # Fit with early stopping, score on the validation split and tag the
    # result with the depth that produced it
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        model_name=f"CNN Depth {num_blocks}",
        epochs=30,
    )
    result.update(num_blocks=num_blocks)
    depth_results.append(result)

### Experiment 1: Results Summary

In [None]:
# Compare depth experiment results in a table
# Column title -> (result key, format spec) for the formatted metric columns
depth_metric_columns = {
    "Val Accuracy": ("accuracy", ".4f"),
    "Val Precision": ("precision", ".4f"),
    "Val Recall": ("recall", ".4f"),
    "Val F1-Score": ("f1", ".4f"),
    "Train Time (s)": ("train_time", ".2f"),
    "Predict Time (s)": ("predict_time", ".2f"),
}

# One row per depth configuration; metrics are stored as formatted strings
depth_summary = pd.DataFrame(
    {
        "Blocks": [r["num_blocks"] for r in depth_results],
        "Model": [r["model_name"] for r in depth_results],
        **{
            title: [format(r[key], spec) for r in depth_results]
            for title, (key, spec) in depth_metric_columns.items()
        },
    }
)

print("Experiment 1: Network Depth Results")
display(depth_summary)

In [None]:
# Save Experiment 1 results to CSV.
# The file name previously read "esults_..." (missing leading "r"), and the
# target directory was assumed to exist; create it so the write cannot fail
# on a fresh checkout.
depth_csv_path = Path("results") / "results_experiment1_depth.csv"
depth_csv_path.parent.mkdir(parents=True, exist_ok=True)
depth_summary.to_csv(depth_csv_path, index=False)
# Report the path that was actually written.
print(f"\nResults saved to: {depth_csv_path}")

In [None]:
# 2x2 dashboard for the depth experiment: F1, accuracy, training time and a
# grouped bar chart of accuracy/precision/recall.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

depth_blocks = [r["num_blocks"] for r in depth_results]
depth_f1 = [r["f1"] for r in depth_results]
depth_accuracy = [r["accuracy"] for r in depth_results]
depth_train_time = [r["train_time"] for r in depth_results]

depth_label_font = {"fontsize": 12, "fontweight": "bold"}
depth_title_font = {"fontsize": 14, "fontweight": "bold"}

# Top row: one line plot per metric, sharing everything but data and marker
depth_line_panels = (
    (axes[0, 0], depth_f1, "F1-Score", {"marker": "o"}),
    (axes[0, 1], depth_accuracy, "Accuracy", {"marker": "s", "color": "green"}),
)
for panel_ax, panel_values, panel_metric, panel_style in depth_line_panels:
    panel_ax.plot(
        depth_blocks, panel_values, linewidth=2, markersize=8, **panel_style
    )
    panel_ax.set_xlabel("Number of Blocks", **depth_label_font)
    panel_ax.set_ylabel(panel_metric, **depth_label_font)
    panel_ax.set_title(f"{panel_metric} vs Network Depth", **depth_title_font)
    panel_ax.grid(alpha=0.3)
    panel_ax.set_xticks(depth_blocks)

# Bottom left: training time per depth
time_ax = axes[1, 0]
time_ax.bar(depth_blocks, depth_train_time, color="coral", edgecolor="black")
time_ax.set_xlabel("Number of Blocks", **depth_label_font)
time_ax.set_ylabel("Training Time (s)", **depth_label_font)
time_ax.set_title("Training Time vs Network Depth", **depth_title_font)
time_ax.grid(axis="y", alpha=0.3)
time_ax.set_xticks(depth_blocks)

# Bottom right: grouped bars, one group per depth
x = np.arange(len(depth_blocks))
width = 0.2
depth_bar_series = (
    ("Accuracy", depth_accuracy),
    ("Precision", [r["precision"] for r in depth_results]),
    ("Recall", [r["recall"] for r in depth_results]),
)
metrics_ax = axes[1, 1]
for bar_positions, (bar_label, bar_scores) in zip(
    (x - width, x, x + width), depth_bar_series
):
    metrics_ax.bar(bar_positions, bar_scores, width, label=bar_label)
metrics_ax.set_xlabel("Number of Blocks", **depth_label_font)
metrics_ax.set_ylabel("Score", **depth_label_font)
metrics_ax.set_title("Metrics Comparison by Depth", **depth_title_font)
metrics_ax.set_xticks(x)
metrics_ax.set_xticklabels(depth_blocks)
metrics_ax.legend()
metrics_ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot the loss/F1 curves of every depth configuration
for result in depth_results:
    curve_title = result["model_name"]
    print(f"\nTraining Curves for: {curve_title}")
    plot_training_curves(result["history"], curve_title)

# Pick the depth with the highest validation F1 (first one wins on ties)
best_depth_idx = np.argmax([r["f1"] for r in depth_results])
best_depth_entry = depth_results[best_depth_idx]
best_depth, best_depth_f1 = best_depth_entry["num_blocks"], best_depth_entry["f1"]

In [None]:
# Report the winning depth; it is carried over into Experiment 2.
print(
    f"Best network depth     : {best_depth} block(s)",
    f"Best F1-Score          : {best_depth_f1:.4f}",
    "This depth will be used for Experiment 2 (Block Width).",
    sep="\n",
)

## Experiment 2: Block Width

In this experiment, we investigate the optimal width of each block by varying the number of Conv2D layers within each block before the MaxPooling layer.

Using the best depth from Experiment 1, we test architectures with **2, 3, and 4** Conv2D layers per block. All configurations use:
- **ELU activation** function
- **He initialization** for weights
- **Batch Normalization** before and after each convolutional layer
- **Early stopping** on validation F1 score

In [None]:
def build_width_model(
    num_blocks, convs_per_block, num_classes, input_shape=(64, 64, 3), base_filters=16
):
    """
    Assemble and compile a CNN with several Conv2D layers per block.

    Each block repeats BatchNorm -> Conv2D (3x3, ELU, He-normal init, same
    padding) -> BatchNorm ``convs_per_block`` times and ends with one 2x2
    MaxPooling layer. The blocks are followed by Flatten, two Dense(128, ELU)
    layers each followed by BatchNorm, and a softmax output layer.

    Args:
        num_blocks: Number of blocks (Conv2D layers + MaxPooling)
        convs_per_block: Number of Conv2D layers in each block before MaxPooling
        num_classes: Number of output classes
        input_shape: Shape of input images
        base_filters: Number of filters, identical in every block

    Returns:
        Keras model compiled with Adam, sparse categorical cross-entropy
        and a weighted SparseF1Score metric
    """
    net = keras.Sequential(name=f"width_model_{convs_per_block}_convs")
    net.add(layers.Input(shape=input_shape))

    # Convolutional feature extractor
    for block_no in range(1, num_blocks + 1):
        for conv_no in range(1, convs_per_block + 1):
            tag = f"block{block_no}_conv{conv_no}"
            net.add(layers.BatchNormalization(name=f"{tag}_bn_pre"))
            net.add(
                layers.Conv2D(
                    filters=base_filters,
                    kernel_size=(3, 3),
                    activation="elu",
                    padding="same",
                    kernel_initializer="he_normal",
                    name=tag,
                )
            )
            net.add(layers.BatchNormalization(name=f"{tag}_bn_post"))

        # A single pooling step closes the block
        net.add(
            layers.MaxPooling2D(pool_size=(2, 2), name=f"block{block_no}_maxpool")
        )

    # Classifier head
    net.add(layers.Flatten(name="flatten"))
    for dense_name in ("dense_1", "dense_2"):
        net.add(
            layers.Dense(
                units=128,
                activation="elu",
                kernel_initializer="he_normal",
                name=dense_name,
            )
        )
        net.add(layers.BatchNormalization(name=f"{dense_name}_bn"))
    net.add(layers.Dense(units=num_classes, activation="softmax", name="output"))

    f1_metric = SparseF1Score(num_classes=num_classes, average="weighted")
    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[f1_metric],
    )

    return net

In [None]:
# Experiment 2 sweep: 2, 3 and 4 Conv2D layers per block at the best depth
width_configs = [2, 3, 4]
width_results = []

print(f"Using best depth from Experiment 1: {best_depth} block(s)")

for convs_per_block in width_configs:
    total_convs = best_depth * convs_per_block
    print(f"Testing CNN with {convs_per_block} Conv2D layer(s) per block")
    print(f"Total blocks: {best_depth}, Total Conv2D layers: {total_convs}")

    # Fresh model for this width
    model = build_width_model(best_depth, convs_per_block, len(class_names))

    # Show the layer stack
    print("\nArchitecture Summary:")
    model.summary()

    # Fit with early stopping, score on the validation split and tag the
    # result with the configuration that produced it
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        model_name=f"CNN Width {convs_per_block}",
        epochs=30,
    )
    result.update(convs_per_block=convs_per_block, num_blocks=best_depth)
    width_results.append(result)

### Experiment 2: Results Summary

In [None]:
# Compare width experiment results in a table
# Column title -> (result key, format spec) for the formatted metric columns
width_metric_columns = {
    "Val Accuracy": ("accuracy", ".4f"),
    "Val Precision": ("precision", ".4f"),
    "Val Recall": ("recall", ".4f"),
    "Val F1-Score": ("f1", ".4f"),
    "Train Time (s)": ("train_time", ".2f"),
    "Predict Time (s)": ("predict_time", ".2f"),
}

# One row per width configuration; metrics are stored as formatted strings
width_summary = pd.DataFrame(
    {
        "Convs/Block": [r["convs_per_block"] for r in width_results],
        "Model": [r["model_name"] for r in width_results],
        **{
            title: [format(r[key], spec) for r in width_results]
            for title, (key, spec) in width_metric_columns.items()
        },
    }
)

print(f"Experiment 2: Block Width Results (using {best_depth} block(s))")
display(width_summary)

In [None]:
# Save Experiment 2 results to CSV.
# Create the target directory first so the write cannot fail on a fresh
# checkout where "results/" does not exist yet.
width_csv_path = Path("results") / "results_experiment2_width.csv"
width_csv_path.parent.mkdir(parents=True, exist_ok=True)
width_summary.to_csv(width_csv_path, index=False)
# Report the path that was actually written (including the directory).
print(f"\nResults saved to: {width_csv_path}")

In [None]:
# 2x2 dashboard for the width experiment: F1, accuracy, training time and a
# grouped bar chart of accuracy/precision/recall.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

width_convs = [r["convs_per_block"] for r in width_results]
width_f1 = [r["f1"] for r in width_results]
width_accuracy = [r["accuracy"] for r in width_results]
width_train_time = [r["train_time"] for r in width_results]

width_label_font = {"fontsize": 12, "fontweight": "bold"}
width_title_font = {"fontsize": 14, "fontweight": "bold"}

# Top row: one line plot per metric, sharing everything but data and marker
width_line_panels = (
    (axes[0, 0], width_f1, "F1-Score", {"marker": "o"}),
    (axes[0, 1], width_accuracy, "Accuracy", {"marker": "s", "color": "green"}),
)
for panel_ax, panel_values, panel_metric, panel_style in width_line_panels:
    panel_ax.plot(
        width_convs, panel_values, linewidth=2, markersize=8, **panel_style
    )
    panel_ax.set_xlabel("Conv2D Layers per Block", **width_label_font)
    panel_ax.set_ylabel(panel_metric, **width_label_font)
    panel_ax.set_title(f"{panel_metric} vs Block Width", **width_title_font)
    panel_ax.grid(alpha=0.3)
    panel_ax.set_xticks(width_convs)

# Bottom left: training time per width
time_ax = axes[1, 0]
time_ax.bar(width_convs, width_train_time, color="coral", edgecolor="black")
time_ax.set_xlabel("Conv2D Layers per Block", **width_label_font)
time_ax.set_ylabel("Training Time (s)", **width_label_font)
time_ax.set_title("Training Time vs Block Width", **width_title_font)
time_ax.grid(axis="y", alpha=0.3)
time_ax.set_xticks(width_convs)

# Bottom right: grouped bars, one group per width
x = np.arange(len(width_convs))
width_bar = 0.2
width_bar_series = (
    ("Accuracy", width_accuracy),
    ("Precision", [r["precision"] for r in width_results]),
    ("Recall", [r["recall"] for r in width_results]),
)
metrics_ax = axes[1, 1]
for bar_positions, (bar_label, bar_scores) in zip(
    (x - width_bar, x, x + width_bar), width_bar_series
):
    metrics_ax.bar(bar_positions, bar_scores, width_bar, label=bar_label)
metrics_ax.set_xlabel("Conv2D Layers per Block", **width_label_font)
metrics_ax.set_ylabel("Score", **width_label_font)
metrics_ax.set_title("Metrics Comparison by Width", **width_title_font)
metrics_ax.set_xticks(x)
metrics_ax.set_xticklabels(width_convs)
metrics_ax.legend()
metrics_ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot the loss/F1 curves of every width configuration
for result in width_results:
    curve_title = result["model_name"]
    print(f"\nTraining Curves for: {curve_title}")
    plot_training_curves(result["history"], curve_title)

# Pick the width with the highest validation F1 (first one wins on ties)
best_width_idx = np.argmax([r["f1"] for r in width_results])
best_width_entry = width_results[best_width_idx]
best_width, best_width_f1 = best_width_entry["convs_per_block"], best_width_entry["f1"]

In [None]:
# Report the winning width; it is carried over into Experiment 3.
print(
    f"Best block width       : {best_width} Conv2D layer(s) per block",
    f"Best F1-Score          : {best_width_f1:.4f}",
    "This width will be used for Experiment 3 (Number of Filters).",
    sep="\n",
)

## Experiment 3: Number of Filters

In this experiment, we investigate the impact of the number of filters in convolutional layers.

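Using the best depth and width from previous experiments, we test architectures with different base filter configurations: **16 and 32**. Unlike Experiments 1 and 2, where every block uses the same number of filters, here the number of filters starts at the base value and doubles after each MaxPooling layer. All configurations use: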
- **ELU activation** function
- **He initialization** for weights  
- **Batch Normalization** before and after each convolutional layer
- **Early stopping** on validation F1 score

In [None]:
def build_filter_model(
    num_blocks, convs_per_block, num_classes, base_filters, input_shape=(64, 64, 3)
):
    """
    Assemble and compile a CNN whose filter count doubles from block to block.

    Block ``k`` (counting from 0) uses ``base_filters * 2**k`` filters in each
    of its Conv2D layers. Each block repeats BatchNorm -> Conv2D (3x3, ELU,
    He-normal init, same padding) -> BatchNorm ``convs_per_block`` times and
    ends with one 2x2 MaxPooling layer. The blocks are followed by Flatten,
    two Dense(128, ELU) layers each followed by BatchNorm, and a softmax
    output layer.

    Args:
        num_blocks: Number of blocks (Conv2D layers + MaxPooling)
        convs_per_block: Number of Conv2D layers in each block before MaxPooling
        num_classes: Number of output classes
        base_filters: Number of filters in the first block
        input_shape: Shape of input images

    Returns:
        Keras model compiled with Adam, sparse categorical cross-entropy
        and a weighted SparseF1Score metric
    """
    net = keras.Sequential(name=f"filter_model_{base_filters}_filters")
    net.add(layers.Input(shape=input_shape))

    # Convolutional feature extractor
    for block_idx in range(num_blocks):
        block_no = block_idx + 1
        # Filter count doubles with every block
        block_filters = base_filters * (2**block_idx)

        for conv_no in range(1, convs_per_block + 1):
            tag = f"block{block_no}_conv{conv_no}"
            net.add(layers.BatchNormalization(name=f"{tag}_bn_pre"))
            net.add(
                layers.Conv2D(
                    filters=block_filters,
                    kernel_size=(3, 3),
                    activation="elu",
                    padding="same",
                    kernel_initializer="he_normal",
                    name=tag,
                )
            )
            net.add(layers.BatchNormalization(name=f"{tag}_bn_post"))

        # A single pooling step closes the block
        net.add(
            layers.MaxPooling2D(pool_size=(2, 2), name=f"block{block_no}_maxpool")
        )

    # Classifier head
    net.add(layers.Flatten(name="flatten"))
    for dense_name in ("dense_1", "dense_2"):
        net.add(
            layers.Dense(
                units=128,
                activation="elu",
                kernel_initializer="he_normal",
                name=dense_name,
            )
        )
        net.add(layers.BatchNormalization(name=f"{dense_name}_bn"))
    net.add(layers.Dense(units=num_classes, activation="softmax", name="output"))

    f1_metric = SparseF1Score(num_classes=num_classes, average="weighted")
    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[f1_metric],
    )

    return net

In [None]:
# Experiment 3 sweep: base filter counts 16 and 32 at the best depth and width
filter_configs = [16, 32]
filter_results = []

print(
    f"Using best depth from Experiment 1: {best_depth} block(s)",
    f"Using best width from Experiment 2: {best_width} Conv2D layer(s) per block",
    sep="\n",
)

for base_filters in filter_configs:
    print(f"Testing CNN with base filters: {base_filters}")

    # Fresh model for this filter setting
    model = build_filter_model(best_depth, best_width, len(class_names), base_filters)

    # Show the layer stack
    print("\nArchitecture Summary:")
    model.summary()

    # Fit with early stopping, score on the validation split and tag the
    # result with the configuration that produced it
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        model_name=f"CNN Filters {base_filters}",
        epochs=30,
    )
    result.update(
        base_filters=base_filters,
        num_blocks=best_depth,
        convs_per_block=best_width,
    )
    filter_results.append(result)

### Experiment 3: Results Summary

In [None]:
# Compare filter experiment results in a table
# Column title -> (result key, format spec) for the formatted metric columns
filter_metric_columns = {
    "Val Accuracy": ("accuracy", ".4f"),
    "Val Precision": ("precision", ".4f"),
    "Val Recall": ("recall", ".4f"),
    "Val F1-Score": ("f1", ".4f"),
    "Train Time (s)": ("train_time", ".2f"),
    "Predict Time (s)": ("predict_time", ".2f"),
}

# One row per filter configuration; metrics are stored as formatted strings
filter_summary = pd.DataFrame(
    {
        "Base Filters": [r["base_filters"] for r in filter_results],
        "Model": [r["model_name"] for r in filter_results],
        **{
            title: [format(r[key], spec) for r in filter_results]
            for title, (key, spec) in filter_metric_columns.items()
        },
    }
)

print("Experiment 3: Number of Filters Results")
display(filter_summary)

In [None]:
# Save Experiment 3 results to CSV.
# Create the target directory first so the write cannot fail on a fresh
# checkout where "results/" does not exist yet.
filter_csv_path = Path("results") / "results_experiment3_filters.csv"
filter_csv_path.parent.mkdir(parents=True, exist_ok=True)
filter_summary.to_csv(filter_csv_path, index=False)
# Report the path that was actually written (including the directory).
print(f"\nResults saved to: {filter_csv_path}")

In [None]:
# 2x2 dashboard for the filter experiment: F1, accuracy, training time and a
# grouped bar chart of precision/recall/F1.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

filter_bases = [r["base_filters"] for r in filter_results]
filter_title_font = {"fontsize": 14, "fontweight": "bold"}

# Top row: one line plot per metric, sharing everything but data and marker
filter_line_panels = (
    (axes[0, 0], "f1", "F1-Score", {"marker": "o"}),
    (axes[0, 1], "accuracy", "Accuracy", {"marker": "s", "color": "green"}),
)
for panel_ax, panel_key, panel_metric, panel_style in filter_line_panels:
    panel_ax.plot(
        filter_bases,
        [r[panel_key] for r in filter_results],
        linewidth=2,
        markersize=8,
        **panel_style,
    )
    panel_ax.set_xlabel("Base Filters", fontsize=12)
    panel_ax.set_ylabel(panel_metric, fontsize=12)
    panel_ax.set_title(f"{panel_metric} vs Number of Filters", **filter_title_font)
    panel_ax.grid(True, alpha=0.3)
    panel_ax.set_xticks(filter_bases)

# Bottom left: training time, drawn at evenly spaced positions and
# relabelled with the filter counts
time_ax = axes[1, 0]
filter_slots = range(len(filter_bases))
time_ax.bar(filter_slots, [r["train_time"] for r in filter_results], color="coral")
time_ax.set_xlabel("Base Filters", fontsize=12)
time_ax.set_ylabel("Training Time (s)", fontsize=12)
time_ax.set_title("Training Time vs Number of Filters", **filter_title_font)
time_ax.set_xticks(filter_slots)
time_ax.set_xticklabels(filter_bases)
time_ax.grid(True, alpha=0.3, axis="y")

# Bottom right: grouped bars, one group per filter configuration
metrics_data = {
    label: [r[key] for r in filter_results]
    for label, key in (
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-Score", "f1"),
    )
}
x = np.arange(len(filter_bases))
width = 0.25

metrics_ax = axes[1, 1]
for i, (metric_name, metric_values) in enumerate(metrics_data.items()):
    metrics_ax.bar(x + i * width, metric_values, width, label=metric_name)

metrics_ax.set_xlabel("Base Filters", fontsize=12)
metrics_ax.set_ylabel("Score", fontsize=12)
metrics_ax.set_title("All Metrics vs Number of Filters", **filter_title_font)
# Centre the tick under the middle bar of each group of three
metrics_ax.set_xticks(x + width)
metrics_ax.set_xticklabels(filter_bases)
metrics_ax.legend()
metrics_ax.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

In [None]:
# Plot the loss/F1 curves of every filter configuration
for result in filter_results:
    curve_title = result["model_name"]
    print(f"\nTraining Curves for: {curve_title}")
    plot_training_curves(result["history"], curve_title)

# Pick the base filter count with the highest validation F1 (first one wins on ties)
best_filter_idx = np.argmax([r["f1"] for r in filter_results])
best_filter_entry = filter_results[best_filter_idx]
best_filter, best_filter_f1 = best_filter_entry["base_filters"], best_filter_entry["f1"]

In [None]:
# Report the winning base filter count; it is carried over into Experiment 4.
print(
    f"Best base filters      : {best_filter}",
    f"Best F1-Score          : {best_filter_f1:.4f}",
    "These base filters will be used for Experiment 4 (Dropout).",
    sep="\n",
)

## Experiment 4: Dropout Regularization

In this experiment, we investigate the impact of dropout regularization on model performance.

Using the optimal architecture from previous experiments, we test different dropout rates: **0.0 (no dropout), 0.2, 0.4, and 0.6**. Dropout is applied after each MaxPooling layer and after the Flatten layer.

In [None]:
def build_dropout_model(
    num_blocks,
    convs_per_block,
    num_classes,
    dropout_rate=0.3,
    input_shape=(64, 64, 3),
    base_filters=32,
):
    """
    Assemble and compile a sequential CNN that uses dropout for regularization.

    Every block stacks `convs_per_block` groups of
    BatchNorm -> Conv2D(3x3, ELU) -> BatchNorm, followed by 2x2 max pooling.
    The filter count starts at `base_filters` and doubles from one block to
    the next. The classifier head is two 128-unit ELU dense layers (each
    followed by BatchNorm) and a softmax output.

    Args:
        num_blocks: Number of convolutional blocks
        convs_per_block: Number of Conv2D layers inside each block
        num_classes: Number of units in the softmax output layer
        dropout_rate: Dropout rate; when it is not > 0 no Dropout layer is added
        input_shape: Shape of a single input image
        base_filters: Number of filters used by the first block

    Returns:
        Compiled Keras model (Adam, sparse categorical cross-entropy,
        weighted SparseF1Score metric)
    """
    use_dropout = dropout_rate > 0

    net = keras.Sequential(name=f"dropout_model_dr{dropout_rate}")
    net.add(layers.Input(shape=input_shape))

    for block_no in range(1, num_blocks + 1):
        # base_filters, 2*base_filters, 4*base_filters, ...
        block_filters = base_filters * (2 ** (block_no - 1))

        for conv_no in range(1, convs_per_block + 1):
            conv_name = f"block{block_no}_conv{conv_no}"
            net.add(layers.BatchNormalization(name=f"{conv_name}_bn_pre"))
            net.add(
                layers.Conv2D(
                    block_filters,
                    (3, 3),
                    activation="elu",
                    padding="same",
                    kernel_initializer="he_normal",
                    name=conv_name,
                )
            )
            net.add(layers.BatchNormalization(name=f"{conv_name}_bn_post"))

        net.add(layers.MaxPooling2D((2, 2), name=f"block{block_no}_maxpool"))

        # Dropout directly after the pooling layer of each block
        if use_dropout:
            net.add(layers.Dropout(dropout_rate, name=f"block{block_no}_dropout"))

    net.add(layers.Flatten(name="flatten"))

    # Dropout on the flattened feature vector
    if use_dropout:
        net.add(layers.Dropout(dropout_rate, name="flatten_dropout"))

    # Two identical dense stages: Dense(128, ELU) followed by BatchNorm
    for dense_no in (1, 2):
        net.add(
            layers.Dense(
                128,
                activation="elu",
                kernel_initializer="he_normal",
                name=f"dense_{dense_no}",
            )
        )
        net.add(layers.BatchNormalization(name=f"dense_{dense_no}_bn"))

    net.add(layers.Dense(num_classes, activation="softmax", name="output"))

    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[SparseF1Score(num_classes=num_classes, average="weighted")],
    )

    return net

In [None]:
# Experiment 4 sweep: same architecture every time, only the dropout rate changes
dropout_configs = [0.0, 0.2, 0.4, 0.6]
dropout_results = []

print(
    "Using optimal architecture: "
    f"{best_depth} blocks, {best_width} convs/block, {best_filter} base filters"
)

for dropout_rate in dropout_configs:
    print(f"Testing CNN with dropout rate: {dropout_rate}")

    # Fresh compiled network for this dropout setting
    model = build_dropout_model(
        num_blocks=best_depth,
        convs_per_block=best_width,
        num_classes=len(class_names),
        dropout_rate=dropout_rate,
        base_filters=best_filter,
    )

    # Show the layer stack
    print("\nArchitecture Summary:")
    model.summary()

    # Train and evaluate with the shared helper
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        f"CNN Dropout {dropout_rate}",
        epochs=30,
    )

    # Remember which dropout rate produced these metrics
    result["dropout_rate"] = dropout_rate
    dropout_results.append(result)

### Experiment 4: Results Summary

In [None]:
# Tabulate the metrics and timings of every dropout run
# (column title, key in the result dict, number format)
dropout_metric_columns = [
    ("Val Accuracy", "accuracy", ".4f"),
    ("Val Precision", "precision", ".4f"),
    ("Val Recall", "recall", ".4f"),
    ("Val F1-Score", "f1", ".4f"),
    ("Train Time (s)", "train_time", ".2f"),
    ("Predict Time (s)", "predict_time", ".2f"),
]

dropout_summary = pd.DataFrame(
    {
        "Dropout Rate": [run["dropout_rate"] for run in dropout_results],
        "Model": [run["model_name"] for run in dropout_results],
        # Metrics are stored as pre-formatted strings, one column per metric
        **{
            title: [format(run[key], spec) for run in dropout_results]
            for title, key, spec in dropout_metric_columns
        },
    }
)

print("Experiment 4: Dropout Regularization Results")
display(dropout_summary)

In [None]:
# Save Experiment 4 results to CSV
dropout_csv_path = Path("results") / "results_experiment4_dropout.csv"
# to_csv does not create missing directories, so make sure the folder exists
dropout_csv_path.parent.mkdir(parents=True, exist_ok=True)
dropout_summary.to_csv(dropout_csv_path, index=False)
# Report the path that was actually written (including the results/ folder)
print(f"\nResults saved to: {dropout_csv_path}")

In [None]:
# Visualize dropout experiment results on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

dropout_rates = [r["dropout_rate"] for r in dropout_results]
dropout_f1 = [r["f1"] for r in dropout_results]
dropout_accuracy = [r["accuracy"] for r in dropout_results]
dropout_train_time = [r["train_time"] for r in dropout_results]

# Text styling shared by all axis labels and panel titles
label_style = {"fontsize": 12, "fontweight": "bold"}
title_style = {"fontsize": 14, "fontweight": "bold"}

# Top row: one line plot per metric, drawn at the actual dropout rates
line_panels = (
    (axes[0, 0], "F1-Score", dropout_f1, {"marker": "o"}),
    (axes[0, 1], "Accuracy", dropout_accuracy, {"marker": "s", "color": "green"}),
)
for panel, metric_label, scores, line_kwargs in line_panels:
    panel.plot(dropout_rates, scores, linewidth=2, markersize=8, **line_kwargs)
    panel.set_xlabel("Dropout Rate", **label_style)
    panel.set_ylabel(metric_label, **label_style)
    panel.set_title(f"{metric_label} vs Dropout Rate", **title_style)
    panel.grid(alpha=0.3)
    panel.set_xticks(dropout_rates)

# Bottom left: training time, one bar per dropout rate
x_pos = np.arange(len(dropout_rates))
time_panel = axes[1, 0]
time_panel.bar(x_pos, dropout_train_time, color="coral", edgecolor="black")
time_panel.set_xlabel("Dropout Rate", **label_style)
time_panel.set_ylabel("Training Time (s)", **label_style)
time_panel.set_title("Training Time vs Dropout Rate", **title_style)
time_panel.grid(axis="y", alpha=0.3)
time_panel.set_xticks(x_pos)
time_panel.set_xticklabels(dropout_rates)

# Bottom right: accuracy / precision / recall as grouped bars
x = np.arange(len(dropout_rates))
width = 0.2
grouped_bars = (
    (-width, "Accuracy", dropout_accuracy),
    (0, "Precision", [r["precision"] for r in dropout_results]),
    (width, "Recall", [r["recall"] for r in dropout_results]),
)
metrics_panel = axes[1, 1]
for shift, bar_label, bar_heights in grouped_bars:
    metrics_panel.bar(x + shift, bar_heights, width, label=bar_label)
metrics_panel.set_xlabel("Dropout Rate", **label_style)
metrics_panel.set_ylabel("Score", **label_style)
metrics_panel.set_title("Metrics Comparison by Dropout", **title_style)
metrics_panel.set_xticks(x)
metrics_panel.set_xticklabels(dropout_rates)
metrics_panel.legend()
metrics_panel.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Show the training curves of every dropout configuration
for result in dropout_results:
    run_name = result["model_name"]
    print(f"\nTraining Curves for: {run_name}")
    plot_training_curves(result["history"], run_name)

# Pick the dropout rate with the highest F1-score
# (np.argmax keeps the first entry when several runs tie)
dropout_f1_scores = [run["f1"] for run in dropout_results]
best_dropout_idx = np.argmax(dropout_f1_scores)
best_dropout_run = dropout_results[best_dropout_idx]
best_dropout, best_dropout_f1 = best_dropout_run["dropout_rate"], best_dropout_run["f1"]

In [None]:
# Report the winning dropout rate that the next experiment reuses
print(
    f"\nBest dropout rate      : {best_dropout}",
    f"Best F1-Score          : {best_dropout_f1:.4f}",
    "This dropout rate will be used for Experiment 5 (L2 Regularization).",
    sep="\n",
)

## Experiment 5: L2 Regularization

In this experiment, we investigate the impact of L2 regularization on the convolutional and dense layers.

Using the optimal architecture and best dropout rate, we test different L2 regularization strengths: **0.0 (no L2), 0.0001, 0.001, 0.01**.

In [None]:
def build_l2_model(
    num_blocks,
    convs_per_block,
    num_classes,
    dropout_rate=0.3,
    l2_strength=0.001,
    input_shape=(64, 64, 3),
    base_filters=32,
):
    """
    Assemble and compile a sequential CNN with optional L2 weight penalties.

    The layer stack mirrors the dropout model: per block, `convs_per_block`
    groups of BatchNorm -> Conv2D(3x3, ELU) -> BatchNorm, then 2x2 max
    pooling and optional dropout; the filter count doubles from block to
    block. The L2 penalty is attached to the kernels of every Conv2D layer
    and of the two 128-unit dense layers (not to the output layer).

    Args:
        num_blocks: Number of convolutional blocks
        convs_per_block: Number of Conv2D layers inside each block
        num_classes: Number of units in the softmax output layer
        dropout_rate: Dropout rate; when it is not > 0 no Dropout layer is added
        l2_strength: L2 factor; when it is not > 0 no regularizer is attached
        input_shape: Shape of a single input image
        base_filters: Number of filters used by the first block

    Returns:
        Compiled Keras model (Adam, sparse categorical cross-entropy,
        weighted SparseF1Score metric)
    """

    def kernel_penalty():
        # None is the Keras default for kernel_regularizer, i.e. "no penalty"
        return regularizers.l2(l2_strength) if l2_strength > 0 else None

    use_dropout = dropout_rate > 0

    net = keras.Sequential(name=f"l2_model_l2{l2_strength}")
    net.add(layers.Input(shape=input_shape))

    for block_no in range(1, num_blocks + 1):
        # base_filters, 2*base_filters, 4*base_filters, ...
        block_filters = base_filters * (2 ** (block_no - 1))

        for conv_no in range(1, convs_per_block + 1):
            conv_name = f"block{block_no}_conv{conv_no}"
            net.add(layers.BatchNormalization(name=f"{conv_name}_bn_pre"))
            net.add(
                layers.Conv2D(
                    block_filters,
                    (3, 3),
                    activation="elu",
                    padding="same",
                    kernel_initializer="he_normal",
                    kernel_regularizer=kernel_penalty(),
                    name=conv_name,
                )
            )
            net.add(layers.BatchNormalization(name=f"{conv_name}_bn_post"))

        net.add(layers.MaxPooling2D((2, 2), name=f"block{block_no}_maxpool"))

        if use_dropout:
            net.add(layers.Dropout(dropout_rate, name=f"block{block_no}_dropout"))

    net.add(layers.Flatten(name="flatten"))

    if use_dropout:
        net.add(layers.Dropout(dropout_rate, name="flatten_dropout"))

    # Two identical dense stages: regularized Dense(128, ELU) followed by BatchNorm
    for dense_no in (1, 2):
        net.add(
            layers.Dense(
                128,
                activation="elu",
                kernel_initializer="he_normal",
                kernel_regularizer=kernel_penalty(),
                name=f"dense_{dense_no}",
            )
        )
        net.add(layers.BatchNormalization(name=f"dense_{dense_no}_bn"))

    net.add(layers.Dense(num_classes, activation="softmax", name="output"))

    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[SparseF1Score(num_classes=num_classes, average="weighted")],
    )

    return net

In [None]:
# Experiment 5 sweep: architecture and dropout stay fixed, only the L2 strength changes
l2_configs = [0.0, 0.0001, 0.001, 0.01]
l2_results = []

print(
    "Using optimal architecture: "
    f"{best_depth} blocks, {best_width} convs/block, {best_filter} base filters"
)
print(f"Using best dropout rate: {best_dropout}")

for l2_strength in l2_configs:
    print(f"Testing CNN with L2 regularization: {l2_strength}")

    # Fresh compiled network for this L2 setting
    model = build_l2_model(
        num_blocks=best_depth,
        convs_per_block=best_width,
        num_classes=len(class_names),
        dropout_rate=best_dropout,
        l2_strength=l2_strength,
        base_filters=best_filter,
    )

    # Show the layer stack
    print("\nArchitecture Summary:")
    model.summary()

    # Train and evaluate with the shared helper
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        f"CNN L2 {l2_strength}",
        epochs=30,
    )

    # Remember which L2 strength produced these metrics
    result["l2_strength"] = l2_strength
    l2_results.append(result)

### Experiment 5: Results Summary

In [None]:
# Tabulate the metrics and timings of every L2 run
# (column title, key in the result dict, number format)
l2_metric_columns = [
    ("Val Accuracy", "accuracy", ".4f"),
    ("Val Precision", "precision", ".4f"),
    ("Val Recall", "recall", ".4f"),
    ("Val F1-Score", "f1", ".4f"),
    ("Train Time (s)", "train_time", ".2f"),
    ("Predict Time (s)", "predict_time", ".2f"),
]

l2_summary = pd.DataFrame(
    {
        "L2 Strength": [run["l2_strength"] for run in l2_results],
        "Model": [run["model_name"] for run in l2_results],
        # Metrics are stored as pre-formatted strings, one column per metric
        **{
            title: [format(run[key], spec) for run in l2_results]
            for title, key, spec in l2_metric_columns
        },
    }
)

print("Experiment 5: L2 Regularization Results")
display(l2_summary)

In [None]:
# Save Experiment 5 results to CSV
l2_csv_path = Path("results") / "results_experiment5_l2.csv"
# to_csv does not create missing directories, so make sure the folder exists
l2_csv_path.parent.mkdir(parents=True, exist_ok=True)
l2_summary.to_csv(l2_csv_path, index=False)
# Report the path that was actually written (including the results/ folder)
print(f"\nResults saved to: {l2_csv_path}")

In [None]:
# Visualize L2 experiment results on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

l2_strengths = [r["l2_strength"] for r in l2_results]
l2_f1 = [r["f1"] for r in l2_results]
l2_accuracy = [r["accuracy"] for r in l2_results]
l2_train_time = [r["train_time"] for r in l2_results]

# The sweep contains l2 = 0.0 (the no-regularization baseline). A logarithmic
# x-axis cannot place a zero value, so the baseline point would not appear in
# the visible range. Every strength is therefore drawn at an evenly spaced
# position and the ticks are labelled with the real values, exactly like the
# bar charts below.
x_pos = np.arange(len(l2_strengths))
l2_tick_labels = [str(strength) for strength in l2_strengths]

# F1-Score vs L2
axes[0, 0].plot(x_pos, l2_f1, marker="o", linewidth=2, markersize=8)
axes[0, 0].set_xlabel("L2 Regularization Strength", fontsize=12, fontweight="bold")
axes[0, 0].set_ylabel("F1-Score", fontsize=12, fontweight="bold")
axes[0, 0].set_title("F1-Score vs L2 Strength", fontsize=14, fontweight="bold")
axes[0, 0].grid(alpha=0.3)
axes[0, 0].set_xticks(x_pos)
axes[0, 0].set_xticklabels(l2_tick_labels)

# Accuracy vs L2
axes[0, 1].plot(
    x_pos,
    l2_accuracy,
    marker="s",
    linewidth=2,
    markersize=8,
    color="green",
)
axes[0, 1].set_xlabel("L2 Regularization Strength", fontsize=12, fontweight="bold")
axes[0, 1].set_ylabel("Accuracy", fontsize=12, fontweight="bold")
axes[0, 1].set_title("Accuracy vs L2 Strength", fontsize=14, fontweight="bold")
axes[0, 1].grid(alpha=0.3)
axes[0, 1].set_xticks(x_pos)
axes[0, 1].set_xticklabels(l2_tick_labels)

# Training Time vs L2
axes[1, 0].bar(
    x_pos,
    l2_train_time,
    color="coral",
    edgecolor="black",
)
axes[1, 0].set_xlabel("L2 Regularization Strength", fontsize=12, fontweight="bold")
axes[1, 0].set_ylabel("Training Time (s)", fontsize=12, fontweight="bold")
axes[1, 0].set_title("Training Time vs L2 Strength", fontsize=14, fontweight="bold")
axes[1, 0].grid(axis="y", alpha=0.3)
axes[1, 0].set_xticks(x_pos)
axes[1, 0].set_xticklabels(l2_strengths)

# All metrics comparison (grouped bars: accuracy / precision / recall)
x = np.arange(len(l2_strengths))
width = 0.2
axes[1, 1].bar(x - width, l2_accuracy, width, label="Accuracy")
axes[1, 1].bar(x, [r["precision"] for r in l2_results], width, label="Precision")
axes[1, 1].bar(x + width, [r["recall"] for r in l2_results], width, label="Recall")
axes[1, 1].set_xlabel("L2 Regularization Strength", fontsize=12, fontweight="bold")
axes[1, 1].set_ylabel("Score", fontsize=12, fontweight="bold")
axes[1, 1].set_title("Metrics Comparison by L2", fontsize=14, fontweight="bold")
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(l2_strengths)
axes[1, 1].legend()
axes[1, 1].grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Show the training curves of every L2 configuration
for result in l2_results:
    run_name = result["model_name"]
    print(f"\nTraining Curves for: {run_name}")
    plot_training_curves(result["history"], run_name)

# Pick the L2 strength with the highest F1-score
# (np.argmax keeps the first entry when several runs tie)
l2_f1_scores = [run["f1"] for run in l2_results]
best_l2_idx = np.argmax(l2_f1_scores)
best_l2_run = l2_results[best_l2_idx]
best_l2, best_l2_f1 = best_l2_run["l2_strength"], best_l2_run["f1"]

In [None]:
# Report the winning L2 strength that the later experiments reuse
print(
    f"\nBest L2 strength       : {best_l2}",
    f"Best F1-Score          : {best_l2_f1:.4f}",
    "This L2 strength will be used for the final model.",
    sep="\n",
)

## Experiment 6: Data Augmentation with Class Balancing

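In this experiment, we use data augmentation to bring every class of the training set that has fewer than 3000 samples up to 3000 samples (classes that are already larger are left unchanged), and investigate the impact on model performance.

The smaller classes are augmented with random TensorFlow transformations: horizontal and vertical flips, rotations by multiples of 90°, zoom and shift (via a random crop followed by a resize), and slight brightness and contrast changes.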

In [None]:
def augment_image(image):
    """
    Return a randomly perturbed copy of one image as a float32 numpy array.

    The transformations are applied in this order: random horizontal flip,
    random vertical flip, brightness shift (max_delta=0.1), contrast scaling
    (0.9-1.1), random crop to 90-100% of the original size followed by a
    resize back to the original height/width, and a rotation by a random
    multiple of 90 degrees. The result is clipped to [0, 1].

    NOTE(review): the final clip and the brightness delta presumably assume
    pixel values already scaled to [0, 1] - confirm against the preprocessing.
    The crop size hard-codes 3 channels.
    """
    pixels = tf.convert_to_tensor(image, dtype=tf.float32)

    # Target size for both resize steps (none of the ops before the crop change it)
    rows, cols = pixels.shape[0], pixels.shape[1]

    # Mirror along either axis
    pixels = tf.image.random_flip_left_right(pixels)
    pixels = tf.image.random_flip_up_down(pixels)

    # Mild photometric jitter
    pixels = tf.image.random_brightness(pixels, max_delta=0.1)
    pixels = tf.image.random_contrast(pixels, lower=0.9, upper=1.1)

    # Zoom + shift: crop a random window covering 90-100% of each side,
    # then scale it back to the original size
    keep_fraction = tf.random.uniform([], 0.9, 1.0)
    crop_rows = tf.cast(tf.cast(rows, tf.float32) * keep_fraction, tf.int32)
    crop_cols = tf.cast(tf.cast(cols, tf.float32) * keep_fraction, tf.int32)
    window = tf.image.random_crop(pixels, [crop_rows, crop_cols, 3])
    pixels = tf.image.resize(window, [rows, cols])

    # Rotate by 0, 90, 180 or 270 degrees; the resize restores the original
    # height/width after the rotation
    quarter_turns = tf.random.uniform([], 0, 4, dtype=tf.int32)
    rotated = tf.image.rot90(pixels, k=quarter_turns)
    pixels = tf.image.resize(rotated, [rows, cols])

    # Keep the values inside [0, 1] and hand back a numpy array
    return tf.clip_by_value(pixels, 0.0, 1.0).numpy()


def balance_dataset(X, y, target_samples=3000):
    """
    Top up every class that has fewer than `target_samples` images.

    All original samples are kept. For each class below the target, randomly
    chosen images of that class are passed through `augment_image` until the
    class reaches `target_samples`. Classes above the target are reported but
    not downsampled. Class names for the printed report are read from the
    module-level `class_names`.

    Args:
        X: Array of images, indexed by sample along the first axis
        y: Array of integer class labels, one per image
        target_samples: Minimum number of samples per class after balancing

    Returns:
        Tuple (X_balanced, y_balanced) of numpy arrays, grouped by class in
        ascending label order (originals first, then augmented samples)
    """

    def report_distribution(label_array):
        # Print one line per class and return the sorted class ids
        class_ids, sizes = np.unique(label_array, return_counts=True)
        for class_id, size in zip(class_ids, sizes):
            print(f"  Class {class_id} ({class_names[class_id]:20s}): {size:5d} samples")
        return class_ids

    print(f"\nBalancing dataset to {target_samples} samples per class...")

    print("\nOriginal class distribution:")
    unique_classes = report_distribution(y)

    collected_images = []
    collected_labels = []

    for cls in unique_classes:
        members = X[np.where(y == cls)[0]]
        member_count = len(members)

        # The originals are always kept
        collected_images.extend(members)
        collected_labels.extend([cls] * member_count)

        shortfall = target_samples - member_count
        if shortfall > 0:
            print(
                f"\nAugmenting class {cls} ({class_names[cls]}): adding {shortfall} samples"
            )
            for _ in range(shortfall):
                # Augment a randomly drawn original of this class
                pick = np.random.randint(0, member_count)
                collected_images.append(augment_image(members[pick]))
                collected_labels.append(cls)
        elif shortfall < 0:
            # Over-represented classes are left as they are
            print(
                f"\nClass {cls} ({class_names[cls]}) has {member_count} samples (no downsampling)"
            )

    X_balanced = np.array(collected_images)
    y_balanced = np.array(collected_labels)

    print("Balanced class distribution:")
    report_distribution(y_balanced)

    print(f"\nTotal samples: {len(X_balanced)}")

    return X_balanced, y_balanced

In [None]:
# Top up every under-represented class of the training split to 3000 samples
X_train_balanced, y_train_balanced = balance_dataset(
    X_train,
    y_train,
    target_samples=3000,
)

# balance_dataset returns the samples grouped by class, so shuffle images and
# labels with one shared random permutation
indices = np.random.permutation(len(X_train_balanced))
X_train_balanced, y_train_balanced = (
    X_train_balanced[indices],
    y_train_balanced[indices],
)

print(
    f"\nBalanced training set shape: {X_train_balanced.shape}",
    f"Balanced labels shape: {y_train_balanced.shape}",
    sep="\n",
)

In [None]:
# Experiment 6: best architecture / dropout / L2, trained on the balanced set
print("Training CNN with Balanced Dataset + Data Augmentation")

model_augmented = build_l2_model(
    num_blocks=best_depth,
    convs_per_block=best_width,
    num_classes=len(class_names),
    dropout_rate=best_dropout,
    l2_strength=best_l2,
    base_filters=best_filter,
)

print("\nArchitecture Summary:")
model_augmented.summary()

# Train on the balanced training data; the validation data is the same
# X_val / y_val used by the earlier experiments
result_augmented = evaluate_model(
    model_augmented,
    X_train_balanced,
    X_val,
    y_train_balanced,
    y_val,
    "CNN with Data Augmentation",
    epochs=30,
)

print(
    "Data Augmentation Results:",
    f"  F1-Score (validation)  : {result_augmented['f1']:.4f}",
    f"  Accuracy (validation)  : {result_augmented['accuracy']:.4f}",
    sep="\n",
)

### Experiment 6: Results Summary

In [None]:
# Baseline = best L2 run (trained without augmentation); compare it with the augmented run
augmentation_results = [l2_results[best_l2_idx], result_augmented]

# (column title, key in the result dict, number format)
augmentation_metric_columns = [
    ("Val Accuracy", "accuracy", ".4f"),
    ("Val Precision", "precision", ".4f"),
    ("Val Recall", "recall", ".4f"),
    ("Val F1-Score", "f1", ".4f"),
    ("Train Time (s)", "train_time", ".2f"),
    ("Predict Time (s)", "predict_time", ".2f"),
]

augmentation_summary = pd.DataFrame(
    {
        "Model": [run["model_name"] for run in augmentation_results],
        # Metrics are stored as pre-formatted strings, one column per metric
        **{
            title: [format(run[key], spec) for run in augmentation_results]
            for title, key, spec in augmentation_metric_columns
        },
    }
)

print("Experiment 6: Data Augmentation vs Baseline")
display(augmentation_summary)

In [None]:
# Save Experiment 6 results to CSV
augmentation_csv_path = Path("results") / "results_experiment6_augmentation.csv"
# to_csv does not create missing directories, so make sure the folder exists
augmentation_csv_path.parent.mkdir(parents=True, exist_ok=True)
augmentation_summary.to_csv(augmentation_csv_path, index=False)
# Report the path that was actually written (including the results/ folder)
print(f"\nResults saved to: {augmentation_csv_path}")

In [None]:
# Absolute score differences (augmented minus baseline), scaled by 100,
# i.e. expressed in percentage points
baseline_run = l2_results[best_l2_idx]
f1_improvement, accuracy_improvement = (
    (result_augmented[metric] - baseline_run[metric]) * 100
    for metric in ("f1", "accuracy")
)

print(
    "\nImprovement with Data Augmentation:",
    f"  F1-Score improvement   : {f1_improvement:+.2f}%",
    f"  Accuracy improvement   : {accuracy_improvement:+.2f}%",
    sep="\n",
)

In [None]:
# Visualize comparison between baseline and augmented model
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = ["Baseline (Best L2)", "With Data Augmentation"]
metrics_names = ["Accuracy", "Precision", "Recall", "F1-Score"]

# Result-dict keys in the same order as metrics_names
metric_keys = ["accuracy", "precision", "recall", "f1"]
baseline_run = l2_results[best_l2_idx]

baseline_metrics = [baseline_run[key] for key in metric_keys]
augmented_metrics = [result_augmented[key] for key in metric_keys]

# Metrics comparison (side-by-side bars per metric)
x = np.arange(len(metrics_names))
width = 0.35
axes[0].bar(x - width / 2, baseline_metrics, width, label="Baseline", alpha=0.8)
axes[0].bar(
    x + width / 2, augmented_metrics, width, label="With Augmentation", alpha=0.8
)
axes[0].set_xlabel("Metrics", fontsize=12, fontweight="bold")
axes[0].set_ylabel("Score", fontsize=12, fontweight="bold")
axes[0].set_title(
    "Baseline vs Data Augmentation - All Metrics", fontsize=14, fontweight="bold"
)
axes[0].set_xticks(x)
axes[0].set_xticklabels(metrics_names)
axes[0].legend()
axes[0].grid(axis="y", alpha=0.3)

# Keep the zoomed-in 0.8-1.0 window while every score fits into it; if a
# score falls below 0.8, lower the floor so that its bar is still visible
lowest_score = min(baseline_metrics + augmented_metrics)
y_floor = 0.8 if lowest_score >= 0.8 else max(0.0, lowest_score - 0.05)
axes[0].set_ylim([y_floor, 1.0])

# Training time comparison
train_times = [baseline_run["train_time"], result_augmented["train_time"]]
axes[1].bar(
    models, train_times, color=["coral", "steelblue"], edgecolor="black", alpha=0.8
)
axes[1].set_ylabel("Training Time (s)", fontsize=12, fontweight="bold")
axes[1].set_title("Training Time Comparison", fontsize=14, fontweight="bold")
axes[1].grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Training curves of the baseline (best L2 run) followed by the augmented run
curve_runs = (
    ("\nTraining Curves - Baseline (Best L2 Model):", l2_results[best_l2_idx]),
    ("\nTraining Curves - With Data Augmentation:", result_augmented),
)
for heading, run in curve_runs:
    print(heading)
    plot_training_curves(run["history"], run["model_name"])

## Experiment 7: Batch Size

In this experiment, we investigate the impact of different batch sizes on model performance.

Using the optimal architecture from previous experiments, we test different batch sizes: **16, 32, 64, and 128**. We use the best hyperparameters found in previous experiments.

In [None]:
def build_batch_size_model(
    num_blocks,
    convs_per_block,
    num_classes,
    dropout_rate,
    l2_strength,
    base_filters,
    batch_size,
):
    """
    Build the CNN used for the batch size experiments.

    The layer stack and compile settings are kept identical to the network
    tuned in the L2 experiment (`build_l2_model`), so that batch size is the
    only variable in this experiment: BatchNorm -> Conv2D(3x3, ELU,
    he_normal) -> BatchNorm groups, max pooling and dropout per block,
    dropout after Flatten, and two he_normal Dense(128, ELU) + BatchNorm
    stages. The number of filters doubles after each MaxPooling layer.

    Args:
        num_blocks: Number of blocks
        convs_per_block: Number of Conv2D layers per block
        num_classes: Number of output classes
        dropout_rate: Dropout rate (no Dropout layers are added when not > 0)
        l2_strength: L2 regularization strength (no regularizer when not > 0)
        base_filters: Base number of filters in the first block
        batch_size: Batch size (used for model naming only)

    Returns:
        Compiled Keras model (Adam, sparse categorical cross-entropy,
        weighted SparseF1Score metric), like the other builders in this file
    """

    def kernel_penalty():
        # None is the Keras default for kernel_regularizer, i.e. "no penalty"
        return keras.regularizers.l2(l2_strength) if l2_strength > 0 else None

    use_dropout = dropout_rate > 0

    model = keras.Sequential(name=f"batch_model_{batch_size}")
    model.add(keras.layers.Input(shape=(64, 64, 3)))

    for block in range(num_blocks):
        # Calculate number of filters for this block (doubles after each pooling)
        current_filters = base_filters * (2**block)

        for conv in range(convs_per_block):
            conv_name = f"block{block+1}_conv{conv+1}"
            # Batch Normalization before convolution
            model.add(keras.layers.BatchNormalization(name=f"{conv_name}_bn_pre"))
            # Convolutional layer
            model.add(
                keras.layers.Conv2D(
                    current_filters,
                    (3, 3),
                    activation="elu",
                    padding="same",
                    kernel_initializer="he_normal",
                    kernel_regularizer=kernel_penalty(),
                    name=conv_name,
                )
            )
            # Batch Normalization after convolution
            model.add(keras.layers.BatchNormalization(name=f"{conv_name}_bn_post"))

        # MaxPooling after each block
        model.add(
            keras.layers.MaxPooling2D(pool_size=(2, 2), name=f"block{block+1}_maxpool")
        )
        # Dropout after each block
        if use_dropout:
            model.add(
                keras.layers.Dropout(dropout_rate, name=f"block{block+1}_dropout")
            )

    model.add(keras.layers.Flatten(name="flatten"))

    # Dropout after Flatten (same position as in the dropout / L2 experiments)
    if use_dropout:
        model.add(keras.layers.Dropout(dropout_rate, name="flatten_dropout"))

    # Two identical dense stages: regularized Dense(128, ELU) followed by BatchNorm
    for dense_no in (1, 2):
        model.add(
            keras.layers.Dense(
                128,
                activation="elu",
                kernel_initializer="he_normal",
                kernel_regularizer=kernel_penalty(),
                name=f"dense_{dense_no}",
            )
        )
        model.add(keras.layers.BatchNormalization(name=f"dense_{dense_no}_bn"))

    model.add(keras.layers.Dense(num_classes, activation="softmax", name="output"))

    # Compile exactly like build_dropout_model / build_l2_model, whose compiled
    # models are what the other experiments pass to evaluate_model
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=[SparseF1Score(num_classes=num_classes, average="weighted")],
    )

    return model

In [None]:
# Experiment 7 sweep: architecture, dropout and L2 stay fixed, only the batch size changes
batch_size_configs = [16, 32, 64, 128]
batch_size_results = []

print(
    f"Using optimal architecture: {best_depth} blocks, {best_width} convs/block, {best_filter} base filters"
)
print(f"Using best dropout: {best_dropout}, best L2: {best_l2}")
print("=" * 60)

for batch_size in batch_size_configs:
    print(f"\nTesting CNN with batch size: {batch_size}")

    # Build model with proper naming
    model = build_batch_size_model(
        best_depth,
        best_width,
        len(class_names),
        dropout_rate=best_dropout,
        l2_strength=best_l2,
        base_filters=best_filter,
        batch_size=batch_size,
    )

    # Evaluate model with specific batch size.
    # Positional order is (model, X_train, X_val, y_train, y_val), as in every
    # other evaluate_model call in this notebook; the previous order passed the
    # training labels where the validation images are expected.
    # epochs=30 matches the other experiments so the runs stay comparable.
    result = evaluate_model(
        model,
        X_train,
        X_val,
        y_train,
        y_val,
        model_name=f"CNN Batch {batch_size}",
        epochs=30,
        batch_size=batch_size,
    )

    # Add batch_size to result dictionary
    result["batch_size"] = batch_size
    batch_size_results.append(result)

### Experiment 7: Results Summary

In [None]:
# Tabulate the metrics and timings of every batch size run
# (column title, key in the result dict, number format)
batch_metric_columns = [
    ("Val Accuracy", "accuracy", ".4f"),
    ("Val Precision", "precision", ".4f"),
    ("Val Recall", "recall", ".4f"),
    ("Val F1-Score", "f1", ".4f"),
    ("Train Time (s)", "train_time", ".2f"),
    ("Predict Time (s)", "predict_time", ".2f"),
]

batch_size_summary = pd.DataFrame(
    {
        "Batch Size": [run["batch_size"] for run in batch_size_results],
        "Model": [run["model_name"] for run in batch_size_results],
        # Metrics are stored as pre-formatted strings, one column per metric
        **{
            title: [format(run[key], spec) for run in batch_size_results]
            for title, key, spec in batch_metric_columns
        },
    }
)

print("Experiment 7: Batch Size Optimization Results")
display(batch_size_summary)

In [None]:
# Save Experiment 7 results to CSV
batch_size_csv_path = Path("results") / "results_experiment7_batch_size.csv"
# to_csv does not create missing directories, so make sure the folder exists
batch_size_csv_path.parent.mkdir(parents=True, exist_ok=True)
batch_size_summary.to_csv(batch_size_csv_path, index=False)
# Report the path that was actually written (including the results/ folder)
print(f"\nResults saved to: {batch_size_csv_path}")

In [None]:
# Visualize batch size experiment results on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

batch_sizes = [r["batch_size"] for r in batch_size_results]
batch_f1 = [r["f1"] for r in batch_size_results]
batch_accuracy = [r["accuracy"] for r in batch_size_results]
batch_train_time = [r["train_time"] for r in batch_size_results]

# Text styling shared by all axis labels and panel titles
label_style = {"fontsize": 12, "fontweight": "bold"}
title_style = {"fontsize": 14, "fontweight": "bold"}

# Top row: one line plot per metric on a base-2 logarithmic batch size axis
line_panels = (
    (axes[0, 0], "F1-Score", batch_f1, {"marker": "o"}),
    (axes[0, 1], "Accuracy", batch_accuracy, {"marker": "s", "color": "green"}),
)
for panel, metric_label, scores, line_kwargs in line_panels:
    panel.plot(batch_sizes, scores, linewidth=2, markersize=8, **line_kwargs)
    panel.set_xlabel("Batch Size", **label_style)
    panel.set_ylabel(metric_label, **label_style)
    panel.set_title(f"{metric_label} vs Batch Size", **title_style)
    panel.grid(alpha=0.3)
    panel.set_xscale("log", base=2)
    # Label the ticks with the plain batch sizes
    panel.set_xticks(batch_sizes)
    panel.set_xticklabels(batch_sizes)

# Bottom left: training time, one bar per batch size
x_pos = np.arange(len(batch_sizes))
time_panel = axes[1, 0]
time_panel.bar(x_pos, batch_train_time, color="coral", edgecolor="black")
time_panel.set_xlabel("Batch Size", **label_style)
time_panel.set_ylabel("Training Time (s)", **label_style)
time_panel.set_title("Training Time vs Batch Size", **title_style)
time_panel.grid(axis="y", alpha=0.3)
time_panel.set_xticks(x_pos)
time_panel.set_xticklabels(batch_sizes)

# Bottom right: accuracy / precision / recall as grouped bars
x = np.arange(len(batch_sizes))
width = 0.2
grouped_bars = (
    (-width, "Accuracy", batch_accuracy),
    (0, "Precision", [r["precision"] for r in batch_size_results]),
    (width, "Recall", [r["recall"] for r in batch_size_results]),
)
metrics_panel = axes[1, 1]
for shift, bar_label, bar_heights in grouped_bars:
    metrics_panel.bar(x + shift, bar_heights, width, label=bar_label)
metrics_panel.set_xlabel("Batch Size", **label_style)
metrics_panel.set_ylabel("Score", **label_style)
metrics_panel.set_title("Metrics Comparison by Batch Size", **title_style)
metrics_panel.set_xticks(x)
metrics_panel.set_xticklabels(batch_sizes)
metrics_panel.legend()
metrics_panel.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Show the training curves of every batch size run
for result in batch_size_results:
    run_label = result["model_name"]
    print(f"\nTraining Curves for: {run_label}")
    plot_training_curves(result["history"], run_label)

# The run with the highest F1-Score determines the best batch size
best_batch_idx = np.argmax([run["f1"] for run in batch_size_results])
best_batch_run = batch_size_results[best_batch_idx]
best_batch_size, best_batch_f1 = best_batch_run["batch_size"], best_batch_run["f1"]

In [None]:
# Report the selected batch size together with its F1-Score
print(
    f"\nBest batch size        : {best_batch_size}",
    f"Best F1-Score          : {best_batch_f1:.4f}",
    "This batch size provides optimal training efficiency and performance.",
    sep="\n",
)

## Best Model Selection and Final Evaluation

Now we select the best-performing configuration from Experiments 1-7 by validation F1-score, retrain it on the combined training and validation data, and evaluate it on the test set.

In [None]:
# Results of every experiment, keyed by a human-readable experiment name
all_results = {
    "Depth Experiments": depth_results,
    "Width Experiments": width_results,
    "Filter Experiments": filter_results,
    "Dropout Experiments": dropout_results,
    "L2 Experiments": l2_results,
    "Augmented Model": [result_augmented],
    "Batch Size Experiments": batch_size_results,
}

# Scan every run of every experiment; the strict comparison keeps the
# earliest run when several share the top F1-Score
best_f1_overall, best_model_name, best_experiment = 0, "", ""
experiment_runs = (
    (experiment_label, run)
    for experiment_label, runs in all_results.items()
    for run in runs
)
for exp_name, result in experiment_runs:
    if result["f1"] > best_f1_overall:
        best_f1_overall, best_model_name, best_experiment = (
            result["f1"],
            result["model_name"],
            exp_name,
        )

print(
    f"\nBest Model: {best_model_name}",
    f"From Experiment: {best_experiment}",
    f"Validation F1-Score: {best_f1_overall:.4f}",
    sep="\n",
)

In [None]:
# Retrain the best configuration on training + validation data combined
print("Training Best Model on Full Training Set")

# Merge the training and validation splits into one training set
X_train_full = np.concatenate((X_train, X_val))
y_train_full = np.concatenate((y_train, y_val))

print(f"\nFull training set size: {X_train_full.shape[0]} samples")

# The tuned batch size is reused only when Experiment 7 produced the overall
# winner; otherwise fall back to 32
batch_size_was_tuned = best_experiment == "Batch Size Experiments"
use_batch_size = best_batch_size if batch_size_was_tuned else 32
if batch_size_was_tuned:
    print(f"Using best batch size from Experiment 7: {use_batch_size}")
else:
    print(f"Using default batch size: {use_batch_size}")

# Start from the merged data; switch to the balanced, shuffled set only when
# the augmented model was the overall winner
X_train_final, y_train_final = X_train_full, y_train_full
if best_experiment == "Augmented Model":
    print("\nUsing balanced dataset with augmentation...")
    X_train_final, y_train_final = balance_dataset(
        X_train_full, y_train_full, target_samples=3000
    )

    # Shuffle images and labels with the same permutation
    indices = np.random.permutation(len(X_train_final))
    X_train_final, y_train_final = X_train_final[indices], y_train_final[indices]

# Build the final model from the best hyperparameters of the earlier experiments
final_hyperparameters = {
    "dropout_rate": best_dropout,
    "l2_strength": best_l2,
    "base_filters": best_filter,
}
final_model = build_l2_model(
    best_depth, best_width, len(class_names), **final_hyperparameters
)

print("\nFinal Model Architecture:")
final_model.summary()

# No validation data is passed to fit, so early stopping watches the
# training loss instead
print(f"\nTraining final model with batch size {use_batch_size}...")
early_stopping = keras.callbacks.EarlyStopping(
    monitor="loss",
    mode="min",
    patience=7,
    restore_best_weights=True,
    verbose=1,
)

history_final = final_model.fit(
    X_train_final,
    y_train_final,
    epochs=50,
    batch_size=use_batch_size,
    callbacks=[early_stopping],
    verbose=1,
)

print("\nFinal model training complete!")

In [None]:
# Score the final model on the held-out test set
print("Final Model Evaluation on Test Set")

# Model outputs -> predicted class index (position of the largest output)
y_test_pred_probs = final_model.predict(X_test, verbose=0)
y_test_pred = y_test_pred_probs.argmax(axis=1)

# Accuracy plus weighted-average precision / recall / F1
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision, test_recall, test_f1 = (
    weighted_metric(y_test, y_test_pred, average="weighted")
    for weighted_metric in (precision_score, recall_score, f1_score)
)

print(
    "\nTest Set Performance:",
    f"  Accuracy             : {test_accuracy:.4f}",
    f"  Precision            : {test_precision:.4f}",
    f"  Recall               : {test_recall:.4f}",
    f"  F1-Score             : {test_f1:.4f}",
    sep="\n",
)

In [None]:
# Per-class precision / recall / F1 of the final model on the test set
print("Classification Report (Test Set):")
final_report_text = classification_report(
    y_test, y_test_pred, target_names=class_names
)
print(final_report_text)

In [None]:
# Plot the final model's confusion matrix and keep the helper's return value
print("\nConfusion Matrix (Test Set):")
final_cm_title = "Final Model - Test Set"
cm_test = plot_confusion_matrix(y_test, y_test_pred, class_names, final_cm_title)

## Experiment 8: Transfer Learning

In this experiment, we use ImageNet pre-trained models (ResNet50 and InceptionV3) as frozen feature extractors with a new classification head to classify EuroSAT images.

In [None]:
from tensorflow.keras.applications import ResNet50, InceptionV3
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.inception_v3 import (
    preprocess_input as inception_preprocess,
)


def build_transfer_learning_model(
    base_model_name, num_classes, input_shape=(64, 64, 3)
):
    """
    Build a transfer learning model using pre-trained base models.

    The ImageNet-pretrained backbone is frozen and followed by a small dense
    classification head. Keras refuses to build InceptionV3 for inputs
    smaller than 75x75 (32x32 for ResNet50), so when ``input_shape`` is below
    the backbone's minimum the images are bilinearly upsampled by an integer
    factor inside the model. Callers keep feeding ``input_shape`` images.

    Args:
        base_model_name: Name of the base model ('resnet50' or 'inceptionv3'),
            case-insensitive
        num_classes: Number of output classes
        input_shape: Shape of input images, read as (height, width, channels)

    Returns:
        Compiled Keras model

    Raises:
        ValueError: If base_model_name is not one of the supported models
    """
    # Backbone constructor and the smallest spatial side Keras accepts for it
    backbones = {
        "resnet50": (ResNet50, 32),
        "inceptionv3": (InceptionV3, 75),
    }
    try:
        backbone_cls, min_side = backbones[base_model_name.lower()]
    except KeyError:
        raise ValueError(f"Unknown base model: {base_model_name}") from None

    height, width, channels = input_shape
    if height is None or width is None:
        # Unknown spatial size: nothing to compare against, pass through as-is
        scale = 1
    else:
        # Smallest integer factor that lifts the shorter side to min_side
        # (ceiling division without importing math)
        scale = max(1, -(-min_side // min(height, width)))

    backbone_input_shape = (
        input_shape if scale == 1 else (height * scale, width * scale, channels)
    )

    base_model = backbone_cls(
        weights="imagenet",
        include_top=False,
        input_shape=backbone_input_shape,
        pooling="avg",
    )

    # Freeze base model layers
    base_model.trainable = False

    # NOTE(review): resnet_preprocess / inception_preprocess are imported next
    # to this function but are not applied here; confirm the inputs are
    # already scaled the way each backbone expects.
    model_layers = [layers.Input(shape=input_shape)]
    if scale > 1:
        # Upsample only when the backbone would otherwise reject the input
        model_layers.append(
            layers.UpSampling2D(size=scale, interpolation="bilinear")
        )
    model_layers += [
        base_model,
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.3),
        layers.BatchNormalization(),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.3),
        layers.BatchNormalization(),
        layers.Dense(num_classes, activation="softmax"),
    ]

    # Build the model
    model = keras.Sequential(model_layers)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="sparse_categorical_crossentropy",
        metrics=[SparseF1Score(num_classes=num_classes, average="weighted")],
    )

    return model

### ResNet50 Transfer Learning

In [None]:
# Build the ResNet50 transfer learning model
print("Training ResNet50 Model")

resnet_model = build_transfer_learning_model("resnet50", len(class_names))

print("\nResNet50 Model Architecture:")
resnet_model.summary()

# Train and evaluate on the train / validation split, for 30 epochs
resnet_run_name = "ResNet50 Transfer Learning"
resnet_result = evaluate_model(
    resnet_model, X_train, X_val, y_train, y_val, resnet_run_name, epochs=30
)

In [None]:
# Validation metrics recorded for the ResNet50 run
print(
    "ResNet50 Validation Results:",
    f"  F1-Score             : {resnet_result['f1']:.4f}",
    f"  Accuracy             : {resnet_result['accuracy']:.4f}",
    sep="\n",
)

In [None]:
# Draw the ResNet50 training curves from the stored training history
resnet_history = resnet_result["history"]
plot_training_curves(resnet_history, "ResNet50 Transfer Learning")

In [None]:
# Score ResNet50 on the held-out test set
print("ResNet50 - Test Set Evaluation")

# Softmax outputs -> predicted class index
y_test_pred_resnet_probs = resnet_model.predict(X_test, verbose=0)
y_test_pred_resnet = y_test_pred_resnet_probs.argmax(axis=1)

# Accuracy plus weighted-average precision / recall / F1
test_accuracy_resnet = accuracy_score(y_test, y_test_pred_resnet)
test_precision_resnet, test_recall_resnet, test_f1_resnet = (
    weighted_metric(y_test, y_test_pred_resnet, average="weighted")
    for weighted_metric in (precision_score, recall_score, f1_score)
)

print(
    "\nResNet50 Test Set Performance:",
    f"  Accuracy             : {test_accuracy_resnet:.4f}",
    f"  Precision            : {test_precision_resnet:.4f}",
    f"  Recall               : {test_recall_resnet:.4f}",
    f"  F1-Score             : {test_f1_resnet:.4f}",
    sep="\n",
)

In [None]:
# Per-class precision / recall / F1 of ResNet50 on the test set
print("Classification Report (Test Set):")
resnet_report_text = classification_report(
    y_test, y_test_pred_resnet, target_names=class_names
)
print(resnet_report_text)

In [None]:
# Plot the ResNet50 confusion matrix and keep the helper's return value
resnet_cm_title = "ResNet50 - Test Set"
cm_resnet = plot_confusion_matrix(
    y_test, y_test_pred_resnet, class_names, resnet_cm_title
)

### InceptionV3 Transfer Learning (successor to GoogLeNet / Inception v1)

In [None]:
# Build the InceptionV3 transfer learning model
print("Training InceptionV3 Transfer Learning Model")

inception_model = build_transfer_learning_model("inceptionv3", len(class_names))

print("\nInceptionV3 Model Architecture:")
inception_model.summary()

# Train and evaluate on the train / validation split, for 30 epochs
inception_run_name = "InceptionV3 Transfer Learning"
inception_result = evaluate_model(
    inception_model, X_train, X_val, y_train, y_val, inception_run_name, epochs=30
)

In [None]:
# Validation metrics recorded for the InceptionV3 run
print(
    "InceptionV3 Validation Results:",
    f"  F1-Score             : {inception_result['f1']:.4f}",
    f"  Accuracy             : {inception_result['accuracy']:.4f}",
    sep="\n",
)

In [None]:
# Draw the InceptionV3 training curves from the stored training history
inception_history = inception_result["history"]
plot_training_curves(inception_history, "InceptionV3 Transfer Learning")

In [None]:
# Score InceptionV3 on the held-out test set
print("InceptionV3 - Test Set Evaluation")

# Softmax outputs -> predicted class index
y_test_pred_inception_probs = inception_model.predict(X_test, verbose=0)
y_test_pred_inception = y_test_pred_inception_probs.argmax(axis=1)

# Accuracy plus weighted-average precision / recall / F1
test_accuracy_inception = accuracy_score(y_test, y_test_pred_inception)
test_precision_inception, test_recall_inception, test_f1_inception = (
    weighted_metric(y_test, y_test_pred_inception, average="weighted")
    for weighted_metric in (precision_score, recall_score, f1_score)
)

print(
    "\nInceptionV3 Test Set Performance:",
    f"  Accuracy             : {test_accuracy_inception:.4f}",
    f"  Precision            : {test_precision_inception:.4f}",
    f"  Recall               : {test_recall_inception:.4f}",
    f"  F1-Score             : {test_f1_inception:.4f}",
    sep="\n",
)

In [None]:
# Per-class precision / recall / F1 of InceptionV3 on the test set
print("Classification Report (Test Set):")
inception_report_text = classification_report(
    y_test, y_test_pred_inception, target_names=class_names
)
print(inception_report_text)

In [None]:
# Plot the InceptionV3 confusion matrix and keep the helper's return value
inception_cm_title = "InceptionV3 - Test Set"
cm_inception = plot_confusion_matrix(
    y_test, y_test_pred_inception, class_names, inception_cm_title
)

## Final Comparison: All Models on Test Set

In [None]:
# Side-by-side test metrics of the three models, formatted to 4 decimals
# Each tuple is ordered Custom CNN, ResNet50, InceptionV3
test_metric_columns = {
    "Test Accuracy": (
        test_accuracy,
        test_accuracy_resnet,
        test_accuracy_inception,
    ),
    "Test Precision": (
        test_precision,
        test_precision_resnet,
        test_precision_inception,
    ),
    "Test Recall": (test_recall, test_recall_resnet, test_recall_inception),
    "Test F1-Score": (test_f1, test_f1_resnet, test_f1_inception),
}

comparison_columns = {"Model": ["Custom CNN (Best)", "ResNet50", "InceptionV3"]}
for column_title, column_scores in test_metric_columns.items():
    comparison_columns[column_title] = [f"{score:.4f}" for score in column_scores]

test_comparison = pd.DataFrame(comparison_columns)

print("Final Test Comparison")
display(test_comparison)

In [None]:
# Write the comparison table to the results directory (index column omitted)
final_comparison_csv = "results/results_final_test_comparison.csv"
test_comparison.to_csv(path_or_buf=final_comparison_csv, index=False)
print("\nFinal test results saved to: results_final_test_comparison.csv")

In [None]:
# Bar charts comparing the three models on the test set
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = ["Custom CNN", "ResNet50", "InceptionV3"]
test_f1_scores = [test_f1, test_f1_resnet, test_f1_inception]
test_accuracies = [test_accuracy, test_accuracy_resnet, test_accuracy_inception]

# Left panel: F1-Score, right panel: accuracy; both share colours and y-range
model_bar_colors = ["steelblue", "coral", "green"]
comparison_panels = zip(
    axes, (test_f1_scores, test_accuracies), ("F1-Score", "Accuracy")
)
for compare_ax, model_scores, metric_title in comparison_panels:
    compare_ax.bar(models, model_scores, color=model_bar_colors, edgecolor="black")
    compare_ax.set_ylabel(metric_title, fontsize=12, fontweight="bold")
    compare_ax.set_title(
        f"Test Set {metric_title} Comparison", fontsize=14, fontweight="bold"
    )
    compare_ax.grid(axis="y", alpha=0.3)
    compare_ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

In [None]:
# Pick the model with the highest test F1-Score
best_test_f1 = max(test_f1, test_f1_resnet, test_f1_inception)

# Candidates are checked in priority order, so a tie goes to the earlier
# entry; InceptionV3 is the fallback when neither of the first two matches
best_overall_model = next(
    (
        candidate_name
        for candidate_name, candidate_f1 in (
            ("Custom CNN (Best)", test_f1),
            ("ResNet50", test_f1_resnet),
        )
        if candidate_f1 == best_test_f1
    ),
    "InceptionV3",
)

print(
    f"Best Overall Model on Test Set: {best_overall_model}",
    f"Test F1-Score: {best_test_f1:.4f}",
    sep="\n",
)

## Summary of All Experiments

Complete overview of all experiments and their results.

In [None]:
# Collect the whole summary first, then emit it in one go; entries that
# start with a newline open a new experiment section
experiment_summary_lines = [
    "Complete Experiment Summary - EuroSAT CNN Architecture Experiments",
    # Experiment 1
    "\nExperiment 1: Network Depth",
    f"  - Configurations tested: {[2, 3, 4, 5]} blocks",
    f"  - Best configuration   : {best_depth} blocks",
    f"  - Best val F1-Score    : {best_depth_f1:.4f}",
    # Experiment 2
    "\nExperiment 2: Block Width",
    f"  - Configurations tested: {[2, 3, 4]} Conv2D layers per block",
    f"  - Best configuration   : {best_width} Conv2D layers per block",
    f"  - Best val F1-Score    : {best_width_f1:.4f}",
    # Experiment 3
    "\nExperiment 3: Number of Filters",
    f"  - Configurations tested: {[16, 32, 64, 128]} base filters",
    f"  - Best configuration   : {best_filter} base filters",
    f"  - Best val F1-Score    : {best_filter_f1:.4f}",
    # Experiment 4
    "\nExperiment 4: Dropout Regularization",
    f"  - Configurations tested: {dropout_configs}",
    f"  - Best configuration   : {best_dropout} dropout rate",
    f"  - Best val F1-Score    : {best_dropout_f1:.4f}",
    # Experiment 5
    "\nExperiment 5: L2 Regularization",
    f"  - Configurations tested: {l2_configs}",
    f"  - Best configuration   : {best_l2} L2 strength",
    f"  - Best val F1-Score    : {best_l2_f1:.4f}",
    # Experiment 6
    "\nExperiment 6: Data Augmentation",
    "  - Balanced classes to  : 3000 samples each",
    f"  - Val F1-Score         : {result_augmented['f1']:.4f}",
    # Experiment 7
    "\nExperiment 7: Batch Size Optimization",
    f"  - Configurations tested: {batch_size_configs}",
    f"  - Best configuration   : {best_batch_size} batch size",
    f"  - Best val F1-Score    : {best_batch_f1:.4f}",
    # Experiment 8
    "\nExperiment 8: Transfer Learning",
    f"  - ResNet50 val F1      : {resnet_result['f1']:.4f}",
    f"  - InceptionV3 val F1   : {inception_result['f1']:.4f}",
]
print("\n".join(experiment_summary_lines))

In [None]:
# Recap of the test metrics per model, followed by the winner
print("Final Test Set Results")
for report_label, report_f1, report_accuracy in (
    ("Custom CNN", test_f1, test_accuracy),
    ("ResNet50", test_f1_resnet, test_accuracy_resnet),
    ("InceptionV3", test_f1_inception, test_accuracy_inception),
):
    # Left-pad the model name to 14 columns so the metric columns line up
    print(f"  {report_label:<14}- F1: {report_f1:.4f}, Accuracy: {report_accuracy:.4f}")
print(f"\n  Winner: {best_overall_model} with F1-Score: {best_test_f1:.4f}")