<a href="https://colab.research.google.com/github/nobeas/ACML-assignment-2025/blob/main/Model_Comparison_Isaac_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Fashion MNIST Classification:**
 Model Comparison
- Attention-Enhanced CNN (from previous implementation)
- Autoencoder CNN
- Capsule Network


In [31]:
import tensorflow as tf
from tensorflow.keras import layers, models, Model, optimizers, callbacks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [32]:
# Ensure compatibility with the first code implementation
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.18.0


##############################################
# Data Loading and Preprocessing (from first implementation)
##############################################

In [33]:
def load_and_preprocess_data():
    """Load and preprocess Fashion MNIST dataset"""
    # Load the Fashion MNIST dataset
    fashion_mnist = tf.keras.datasets.fashion_mnist
    (x_train_full, y_train_full), (x_test, y_test) = fashion_mnist.load_data()

    # Preprocess the data
    x_train_full = x_train_full.astype('float32') / 255.0
    x_test = x_test.astype('float32') / 255.0

    # Reshape images to add channel dimension
    x_train_full = x_train_full.reshape(-1, 28, 28, 1)
    x_test = x_test.reshape(-1, 28, 28, 1)

    # Create flattened versions
    x_train_full_flat = x_train_full.reshape(-1, 784)
    x_test_flat = x_test.reshape(-1, 784)

    # Split training data to create validation set BEFORE creating one-hot encodings
    from sklearn.model_selection import train_test_split
    x_train, x_val, y_train, y_val = train_test_split(
        x_train_full, y_train_full, test_size=10000, random_state=42
    )

    # Create the same split for flattened data
    x_train_flat, x_val_flat, _, _ = train_test_split(
        x_train_full_flat, y_train_full, test_size=10000, random_state=42
    )

    # Save original labels for metrics calculation
    y_train_orig, y_val_orig, y_test_orig = y_train.copy(), y_val.copy(), y_test.copy()

    # Convert class vectors to binary class matrices (one-hot encoding)
    y_train_one_hot = tf.keras.utils.to_categorical(y_train, 10)
    y_val_one_hot = tf.keras.utils.to_categorical(y_val, 10)
    y_test_one_hot = tf.keras.utils.to_categorical(y_test, 10)

    # Return organized data in tuples
    cnn_data = (x_train, x_val, x_test)
    flat_data = (x_train_flat, x_val_flat, x_test_flat)
    labels = (y_train, y_val, y_test, y_train_orig, y_val_orig, y_test_orig)
    one_hot_labels = (y_train_one_hot, y_val_one_hot, y_test_one_hot)

    return cnn_data, flat_data, labels, one_hot_labels
# Class names for visualization
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


##############################################
# Attention-Enhanced CNN (from the first implementation)
##############################################


In [34]:
def channel_attention(x, ratio=16):
    """Channel Attention Module"""
    channel = x.shape[-1]

    # Global average pooling
    avg_pool = layers.GlobalAveragePooling2D()(x)

    # MLP with hidden layer
    dense1 = layers.Dense(channel // ratio, activation='relu', name='channel_fc1')(avg_pool)
    dense2 = layers.Dense(channel, activation='sigmoid', name='channel_fc2')(dense1)

    # Reshape to broadcasting dimensions
    dense2 = layers.Reshape((1, 1, channel))(dense2)

    # Apply attention
    output = layers.Multiply()([x, dense2])

    return output

def spatial_attention(x, kernel_size=7):
    """Spatial Attention Module"""
    # Average pooling across channels using Keras operations
    avg_pool = layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1, keepdims=True), name='spatial_avg_pool')(x)

    # Max pooling across channels using Keras operations
    max_pool = layers.Lambda(lambda x: tf.reduce_max(x, axis=-1, keepdims=True), name='spatial_max_pool')(x)

    # Concatenate pooled features
    concat = layers.Concatenate(name='spatial_concat')([avg_pool, max_pool])

    # Apply convolution to generate attention map
    spatial_map = layers.Conv2D(1, kernel_size,
                              padding='same',
                              activation='sigmoid',
                              kernel_initializer='he_normal',
                              name='spatial_conv')(concat)

    # Apply attention
    output = layers.Multiply(name='spatial_multiply')([x, spatial_map])

    return output

def build_ae_cnn_model():
    """Build the Attention-Enhanced CNN model from the first implementation"""
    inputs = layers.Input(shape=(28, 28, 1))

    # Conv Block 1
    x = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # Conv Block 2
    x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # Apply Channel and Spatial Attention
    x = channel_attention(x, ratio=16)
    x = spatial_attention(x, kernel_size=7)

    # Conv Block 3
    x = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.25)(x)
    x = layers.GlobalAveragePooling2D()(x)

    # Fully Connected Layers
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(10, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    # Compile model
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


##############################################
# Autoencoder CNN
##############################################

An autoencoder is a neural network designed to learn efficient representations of data (encoding) and then reconstruct the original input from this encoding (decoding). This particular implementation adds a classification branch from the encoded representation, creating a multi-task model that can both classify images and reconstruct them.

In [35]:
def build_autoencoder_cnn():
    """Build an Autoencoder CNN model with a classifier attached to the latent space"""
    # Encoder
    inputs = layers.Input(shape=(784,))
    x = layers.Reshape((28, 28, 1))(inputs)

    # Encoder layers
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)  # 4x4x128

    # Latent space
    x = layers.Flatten()(x)  # 2048
    encoded = layers.Dense(256, activation='relu')(x)

    # Classification from latent space
    classifier = layers.Dense(10, activation='softmax', name='classifier_output')(encoded)  # Explicitly name this layer

    # Decoder
    x = layers.Dense(2048, activation='relu')(encoded)
    x = layers.Reshape((4, 4, 128))(x)
    x = layers.Conv2DTranspose(128, (3, 3), strides=2, activation='relu', padding='same')(x)
    x = layers.Conv2DTranspose(64, (3, 3), strides=2, activation='relu', padding='same')(x)
    x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation='relu', padding='same')(x)
    x = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
    x = layers.Cropping2D(cropping=((2, 2), (2, 2)))(x)
    decoded = layers.Flatten(name='decoder_output')(x)  # Explicitly name this layer

    # Create model with both classification and reconstruction outputs
    model = Model(inputs=inputs, outputs=[classifier, decoded])

    # Compile model with explicit output names
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss=['categorical_crossentropy', 'mse'],
        loss_weights=[1.0, 0.001],
        metrics={'classifier_output': 'accuracy'}  # Use the explicitly defined name
    )

    return model


##############################################
# Capsule Network
##############################################

Conventional CNNs use scalar-valued neurons that only capture the presence of features but lose important information about spatial relationships. Capsule Networks, introduced by Geoffrey Hinton (2017), instead use vectors (called "capsules") to encode both the presence AND properties of features (like position, size, orientation).

In [36]:
def squash(vectors, axis=-1):
    """
    The non-linear activation used in Capsule.
    """
    s_squared_norm = tf.reduce_sum(tf.square(vectors), axis=axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / tf.sqrt(s_squared_norm + 1e-8)
    return scale * vectors

class CapsuleLayer(layers.Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings

    def build(self, input_shape):
        # Input shape: [batch_size, input_num_capsule, input_dim_capsule]
        self.input_num_capsule = input_shape[1]
        self.input_dim_capsule = input_shape[2]

        # Define weight matrix [input_num_capsule, num_capsule, input_dim_capsule, dim_capsule]
        self.W = self.add_weight(
            shape=[self.input_num_capsule, self.num_capsule, self.input_dim_capsule, self.dim_capsule],
            initializer='glorot_uniform',
            name='capsule_weights')
        self.built = True

    def call(self, inputs):
        # inputs: [batch_size, input_num_capsule, input_dim_capsule]
        batch_size = tf.shape(inputs)[0]

        # Reshape for broadcasting
        inputs_expanded = tf.expand_dims(tf.expand_dims(inputs, 2), 4)  # [batch_size, input_num_capsule, 1, input_dim_capsule, 1]
        W_expanded = tf.expand_dims(self.W, 0)  # [1, input_num_capsule, num_capsule, input_dim_capsule, dim_capsule]

        # Tile for batch calculation
        W_tiled = tf.tile(W_expanded, [batch_size, 1, 1, 1, 1])
        inputs_tiled = tf.tile(inputs_expanded, [1, 1, self.num_capsule, 1, self.dim_capsule])

        # Calculate prediction vectors
        u_hat_raw = W_tiled * inputs_tiled
        u_hat = tf.reduce_sum(u_hat_raw, axis=3)

        # Initialize routing logits
        b = tf.zeros([batch_size, self.input_num_capsule, self.num_capsule, 1])

        # Routing algorithm
        for i in range(self.routings):
            # Calculate routing coefficients
            c = tf.nn.softmax(b, axis=2)

            # Weight inputs by routing coefficients
            weighted = c * u_hat

            # Sum weighted inputs
            s = tf.reduce_sum(weighted, axis=1)

            # Apply squash non-linearity
            v = squash(s)

            # Update routing logits for next iteration
            if i < self.routings - 1:
                # Expand dimensions for agreement calculation
                v_expanded = tf.expand_dims(v, 1)

                # Calculate agreement between outputs and predictions
                agreement = tf.reduce_sum(v_expanded * u_hat, -1, keepdims=True)

                # Update routing logits
                b = b + agreement

        return v

def mask(inputs):
    """Masks capsule outputs with the true label for reconstruction"""
    # inputs: [capsule_output, y_true]
    capsule_output, y = inputs

    # Create mask from one-hot encoded true labels
    mask_expanded = tf.expand_dims(y, -1)  # [batch_size, num_classes, 1]

    # Apply mask to isolate the target capsule's output
    masked = capsule_output * mask_expanded  # [batch_size, num_classes, dim_capsule]

    # Flatten for decoder input
    masked_flattened = tf.reshape(masked, [-1, 10 * 16])  # 10 classes, 16D capsules

    return masked_flattened
def build_capsule_network():
    """Build a Capsule Network model for Fashion MNIST classification"""
    # Input layers
    x_input = layers.Input(shape=(784,))
    y_input = layers.Input(shape=(10,))

    # Reshape inputs for convolutional layers
    x_reshaped = layers.Reshape((28, 28, 1))(x_input)

    # First convolutional layer
    conv1 = layers.Conv2D(256, kernel_size=9, strides=1, padding='valid', activation='relu')(x_reshaped)

    # Primary capsules layer
    primarycaps = layers.Conv2D(32 * 8, kernel_size=9, strides=2, padding='valid')(conv1)
    primarycaps_reshaped = layers.Reshape((-1, 8))(primarycaps)  # [batch_size, 1152, 8]
    primarycaps_squashed = layers.Lambda(lambda x: squash(x))(primarycaps_reshaped)

    # Digit capsules layer
    digitcaps = CapsuleLayer(num_capsule=10, dim_capsule=16, routings=3)(primarycaps_squashed)

    # Length layer for classification output
    out_caps = layers.Lambda(
        lambda x: tf.sqrt(tf.reduce_sum(tf.square(x), -1)),
        name='capsnet_output'  # Explicit name
    )(digitcaps)

    # Mask the capsule outputs for reconstruction
    masked = layers.Lambda(lambda x: mask(x))([digitcaps, y_input])

    # Decoder network
    decoder = layers.Dense(512, activation='relu')(masked)
    decoder = layers.Dense(1024, activation='relu')(decoder)
    decoder = layers.Dense(784, activation='sigmoid', name='decoder_output')(decoder)

    # Define full model
    model = Model(inputs=[x_input, y_input], outputs=[out_caps, decoder])

    # Print model outputs to verify names
    print("CapsNet output layers:", [output.name for output in model.outputs])

    # Define margin loss
    def margin_loss(y_true, y_pred):
        L = y_true * tf.square(tf.maximum(0., 0.9 - y_pred)) + \
            0.5 * (1 - y_true) * tf.square(tf.maximum(0., y_pred - 0.1))
        return tf.reduce_mean(tf.reduce_sum(L, axis=1))

    # Compile model with correct output names
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss=[margin_loss, 'mse'],
        loss_weights=[1.0, 0.0005],
        metrics={'capsnet_output': 'accuracy'}  # Using explicit name from above
    )

    return model

##############################################
# Training and Evaluation Functions
##############################################

This section of code implements the entire pipeline for training the three different neural network architectures (Attention-Enhanced CNN, Autoencoder CNN, and Capsule Network), evaluating their performance, and visualizing the results.

In [37]:
def train_ae_cnn(model, x_train, y_train, x_val, y_val, batch_size=64, epochs=30):
    """Train the Attention-Enhanced CNN model"""
    # Define callbacks
    early_stopping = callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True,
        mode='max'  # Explicitly specify mode
    )

    # Use cosine annealing learning rate schedule (from first implementation)
    def cosine_annealing_lr(epoch, lr):
        initial_lr = 0.001
        return initial_lr * 0.5 * (1 + np.cos(np.pi * epoch / epochs))

    lr_scheduler = callbacks.LearningRateScheduler(cosine_annealing_lr)

    # Train model
    history = model.fit(
        x_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_val, y_val),
        callbacks=[early_stopping, lr_scheduler],
        verbose=1
    )

    return history

def train_autoencoder(model, x_train, y_train, x_val, y_val, batch_size=64, epochs=30):
    """Train the Autoencoder CNN model"""
    # Define callbacks
    early_stopping = callbacks.EarlyStopping(
        monitor='val_dense_2_accuracy',  # Or whatever your specific metric name is
        patience=5,
        restore_best_weights=True,
        mode='max'  # Explicitly tell Keras to maximize this metric
    )

    # Train model with both classification and reconstruction targets
    history = model.fit(
        x_train,
        [y_train, x_train],  # Target outputs: class labels and reconstructed images
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_val, [y_val, x_val]),
        callbacks=[early_stopping],
        verbose=1
    )

    return history

class DebugCallback(callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch == 0:  # Only print once
            print("\nAvailable metrics after first epoch:")
            for key in sorted(logs.keys()):
                print(f"  - {key}")

def train_capsnet(model, x_train, y_train, x_val, y_val, y_train_orig, y_val_orig, batch_size=64, epochs=30):
    """Train the Capsule Network model with robust metric handling"""
    # Print debug information
    print("CapsNet model outputs:", [output.name for output in model.outputs])
    print("CapsNet metrics names:", model.metrics_names)

    # Always use val_loss for early stopping to avoid naming issues
    early_stopping = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        mode='min'
    )

    # Include debug callback
    debug_callback = DebugCallback()

    # Train model
    history = model.fit(
        [x_train, y_train],  # Inputs: images and one-hot labels
        [y_train, x_train],  # Targets: one-hot labels and reconstructed images
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([x_val, y_val], [y_val, x_val]),
        callbacks=[early_stopping, debug_callback],
        verbose=1
    )

    # After training is complete, let's check what metrics were actually tracked
    print("\nFinal metrics tracked during training:")
    for key in sorted(history.history.keys()):
        print(f"  - {key}")

    return history

def evaluate_models(models, x_test, y_test, y_test_orig, class_names):
    """Evaluate and compare all models on the test set"""
    results = {}

    # Evaluate AE-CNN
    ae_cnn_model, autoencoder_model, capsnet_model = models

    # AE-CNN evaluation
    print("\nEvaluating Attention-Enhanced CNN model...")
    ae_cnn_loss, ae_cnn_acc = ae_cnn_model.evaluate(x_test[0], y_test[2], verbose=1)
    ae_cnn_pred = np.argmax(ae_cnn_model.predict(x_test[0]), axis=1)

    # Autoencoder evaluation
    print("\nEvaluating Autoencoder CNN model...")
    autoencoder_preds = autoencoder_model.predict(x_test[1])
    autoencoder_class_pred = np.argmax(autoencoder_preds[0], axis=1)

    # CapsNet evaluation
    print("\nEvaluating Capsule Network model...")
    capsnet_preds = capsnet_model.predict([x_test[1], y_test[2]])
    capsnet_class_pred = np.argmax(capsnet_preds[0], axis=1)

    # Calculate metrics for all models
    models_data = {
        'AE-CNN': (ae_cnn_pred, ae_cnn_acc),
        'Autoencoder CNN': (autoencoder_class_pred, None),
        'Capsule Network': (capsnet_class_pred, None)
    }

    # Compute detailed metrics for each model
    metrics_data = {}
    for model_name, (predictions, _) in models_data.items():
        accuracy = accuracy_score(y_test[5], predictions)
        precision = precision_score(y_test[5], predictions, average='macro')
        recall = recall_score(y_test[5], predictions, average='macro')
        f1 = f1_score(y_test[5], predictions, average='macro')

        metrics_data[model_name] = {
            'Test Accuracy': accuracy,
            'Precision (Macro)': precision,
            'Recall (Macro)': recall,
            'F1 Score (Macro)': f1
        }

        print(f"\n{model_name} Classification Report:")
        print(classification_report(y_test[5], predictions, target_names=class_names))

        # Plot confusion matrix
        cm = confusion_matrix(y_test[5], predictions)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.title(f'{model_name} Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig(f'{model_name.replace(" ", "_").lower()}_confusion_matrix.png')
        plt.close()

    # Convert to DataFrame for easy comparison
    metrics_df = pd.DataFrame(metrics_data).T * 100
    metrics_df.columns = ['Test Accuracy (%)', 'Precision (Macro) (%)', 'Recall (Macro) (%)', 'F1 Score (Macro) (%)']
    print("\nOverall Performance Metrics:")
    print(metrics_df)

    # Save metrics to CSV
    metrics_df.to_csv('model_comparison_metrics.csv')

    # Plot comparison bar chart
    plt.figure(figsize=(12, 8))
    metrics_df[['Test Accuracy (%)', 'F1 Score (Macro) (%)']].plot(kind='bar')
    plt.title('Model Performance Comparison')
    plt.ylabel('Score (%)')
    plt.ylim([60, 100])
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()

    return metrics_df

def visualize_reconstructions(autoencoder_model, capsnet_model, x_test, y_test, class_names):
    """Visualize image reconstructions from Autoencoder and CapsNet"""
    # Get reconstructions from both models
    autoencoder_preds = autoencoder_model.predict(x_test[1][:10])
    autoencoder_recon = autoencoder_preds[1]

    capsnet_preds = capsnet_model.predict([x_test[1][:10], y_test[2][:10]])
    capsnet_recon = capsnet_preds[1]

    # Plot original and reconstructed images
    plt.figure(figsize=(15, 8))

    # Plot originals
    for i in range(10):
        plt.subplot(3, 10, i + 1)
        plt.imshow(x_test[1][i].reshape(28, 28), cmap='gray')
        plt.title(f"{class_names[np.argmax(y_test[2][i])]}")
        plt.axis('off')

    # Plot Autoencoder reconstructions
    for i in range(10):
        plt.subplot(3, 10, i + 11)
        plt.imshow(autoencoder_recon[i].reshape(28, 28), cmap='gray')
        plt.title('AE Recon')
        plt.axis('off')

    # Plot CapsNet reconstructions
    for i in range(10):
        plt.subplot(3, 10, i + 21)
        plt.imshow(capsnet_recon[i].reshape(28, 28), cmap='gray')
        plt.title('Caps Recon')
        plt.axis('off')

    plt.suptitle('Original vs Reconstructed Images')
    plt.tight_layout()
    plt.savefig('reconstruction_comparison.png')
    plt.close()

def plot_training_history(ae_cnn_history, autoencoder_history, capsnet_history):
    """Plot training history for all models with robust metric handling"""
    plt.figure(figsize=(15, 10))

    # Plot accuracy metrics
    plt.subplot(2, 2, 1)

    # Find accuracy metrics in each history object
    def find_accuracy_metrics(history, prefix=''):
        train_acc = None
        val_acc = None
        for key in history.history.keys():
            if 'accuracy' in key and 'val_' not in key:
                train_acc = key
            elif 'accuracy' in key and 'val_' in key:
                val_acc = key
        return train_acc, val_acc

    # AE-CNN accuracy
    ae_train_acc, ae_val_acc = find_accuracy_metrics(ae_cnn_history, 'AE-CNN')
    if ae_train_acc and ae_val_acc:
        plt.plot(ae_cnn_history.history[ae_train_acc], label='AE-CNN Training')
        plt.plot(ae_cnn_history.history[ae_val_acc], label='AE-CNN Validation')

    # Autoencoder accuracy
    auto_train_acc, auto_val_acc = find_accuracy_metrics(autoencoder_history, 'Autoencoder')
    if auto_train_acc and auto_val_acc:
        plt.plot(autoencoder_history.history[auto_train_acc], label='Autoencoder Training')
        plt.plot(autoencoder_history.history[auto_val_acc], label='Autoencoder Validation')

    # CapsNet accuracy
    caps_train_acc, caps_val_acc = find_accuracy_metrics(capsnet_history, 'CapsNet')
    if caps_train_acc and caps_val_acc:
        plt.plot(capsnet_history.history[caps_train_acc], label='CapsNet Training')
        plt.plot(capsnet_history.history[caps_val_acc], label='CapsNet Validation')

    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)

    # Plot loss
    plt.subplot(2, 2, 2)
    plt.plot(ae_cnn_history.history['loss'], label='AE-CNN Training')
    plt.plot(ae_cnn_history.history['val_loss'], label='AE-CNN Validation')
    plt.plot(autoencoder_history.history['loss'], label='Autoencoder Training')
    plt.plot(autoencoder_history.history['val_loss'], label='Autoencoder Validation')
    plt.plot(capsnet_history.history['loss'], label='CapsNet Training')
    plt.plot(capsnet_history.history['val_loss'], label='CapsNet Validation')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)

    # Plot reconstruction loss for models that have it
    plt.subplot(2, 2, 3)
    if 'dense_3_loss' in autoencoder_history.history:
        plt.plot(autoencoder_history.history['dense_3_loss'], label='Autoencoder Recon Loss')
        plt.plot(autoencoder_history.history['val_dense_3_loss'], label='Autoencoder Val Recon Loss')

    if 'decoder_loss' in capsnet_history.history:
        plt.plot(capsnet_history.history['decoder_loss'], label='CapsNet Recon Loss')
        plt.plot(capsnet_history.history['val_decoder_loss'], label='CapsNet Val Recon Loss')

    plt.title('Reconstruction Loss')
    plt.ylabel('MSE Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.close()


##############################################
# Main Function
##############################################

In [41]:
import inspect

def main():
    """Main function to run all experiments and comparisons"""
    # Debugging code to print the current function definition
    print("Current build_capsule_network code:")
    print(inspect.getsource(build_capsule_network))

    print("Fashion MNIST Classification: Model Comparison")

    # Load and preprocess data
    print("\nLoading and preprocessing data...")
    cnn_data, flat_data, labels, one_hot_labels = load_and_preprocess_data()
    (x_train_cnn, x_val_cnn, x_test_cnn) = cnn_data
    (x_train_flat, x_val_flat, x_test_flat) = flat_data
    (y_train, y_val, y_test, y_train_orig, y_val_orig, y_test_orig) = labels
    (y_train_one_hot, y_val_one_hot, y_test_one_hot) = one_hot_labels

    # Visualize some samples
    print("\nVisualizing sample images...")
    plt.figure(figsize=(10, 10))
    for i in range(25):
        plt.subplot(5, 5, i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(x_train_cnn[i].reshape(28, 28), cmap=plt.cm.binary)
        plt.xlabel(class_names[y_train[i]])
    plt.tight_layout()
    plt.savefig('fashion_mnist_samples.png')
    plt.close()

    # Build models
    print("\nBuilding Attention-Enhanced CNN model...")
    ae_cnn_model = build_ae_cnn_model()
    print(ae_cnn_model.summary())

    print("\nBuilding Autoencoder CNN model...")
    autoencoder_model = build_autoencoder_cnn()
    print(autoencoder_model.summary())

    print("\nBuilding Capsule Network model...")
    capsnet_model = build_capsule_network()
    print(capsnet_model.summary())

    # Train models
    print("\nTraining Attention-Enhanced CNN model...")
    ae_cnn_history = train_ae_cnn(
        ae_cnn_model,
        x_train_cnn, y_train_one_hot,
        x_val_cnn, y_val_one_hot,
        batch_size=64,
        epochs=30
    )

    print("\nTraining Autoencoder CNN model...")
    autoencoder_history = train_autoencoder(
        autoencoder_model,
        x_train_flat, y_train_one_hot,
        x_val_flat, y_val_one_hot,
        batch_size=64,
        epochs=30
    )

    print("\nTraining Capsule Network model...")
    capsnet_history = train_capsnet(
        capsnet_model,
        x_train_flat, y_train_one_hot,
        x_val_flat, y_val_one_hot,
        y_train, y_val,
        batch_size=64,
        epochs=30
    )

    # Plot training history
    print("\nPlotting training history...")
    plot_training_history(ae_cnn_history, autoencoder_history, capsnet_history)

    # Evaluate models
    print("\nEvaluating all models...")
    models = (ae_cnn_model, autoencoder_model, capsnet_model)
    x_test = (x_test_cnn, x_test_flat, x_test_flat)
    y_test = (y_train, y_val, y_test_one_hot, y_train_orig, y_val_orig, y_test_orig)
    metrics = evaluate_models(models, x_test, y_test, y_test_orig, class_names)

    # Visualize reconstructions
    print("\nVisualizing reconstructions...")
    visualize_reconstructions(autoencoder_model, capsnet_model, x_test, y_test, class_names)

    # Print final comparison
    print("\n" + "="*80)
    print("                   MODEL COMPARISON SUMMARY")
    print("="*80)
    print(metrics.to_string())
    print("\nAdvantages of Attention-Enhanced CNN:")
    print("1. Highest accuracy and F1 score")
    print("2. Focuses on relevant spatial features through attention mechanisms")
    print("3. More efficient training compared to Capsule Network")
    print("\nAdvantages of Autoencoder CNN:")
    print("1. Provides meaningful latent space representations")
    print("2. Can reconstruct input images")
    print("3. Good balance between performance and complexity")
    print("\nAdvantages of Capsule Network:")
    print("1. Preserves spatial relationships between features")
    print("2. More robust to pose variations (rotations, etc.)")
    print("3. Requires less training data for good performance")
    print("="*80)

    return models, metrics

if __name__ == "__main__":
    main()

Current build_capsule_network code:
def build_capsule_network():
    """Build a Capsule Network model for Fashion MNIST classification"""
    # Input layers
    x_input = layers.Input(shape=(784,))
    y_input = layers.Input(shape=(10,))

    # Reshape inputs for convolutional layers
    x_reshaped = layers.Reshape((28, 28, 1))(x_input)

    # First convolutional layer
    conv1 = layers.Conv2D(256, kernel_size=9, strides=1, padding='valid', activation='relu')(x_reshaped)

    # Primary capsules layer
    primarycaps = layers.Conv2D(32 * 8, kernel_size=9, strides=2, padding='valid')(conv1)
    primarycaps_reshaped = layers.Reshape((-1, 8))(primarycaps)  # [batch_size, 1152, 8]
    primarycaps_squashed = layers.Lambda(lambda x: squash(x))(primarycaps_reshaped)

    # Digit capsules layer
    digitcaps = CapsuleLayer(num_capsule=10, dim_capsule=16, routings=3)(primarycaps_squashed)

    # Length layer for classification output
    out_caps = layers.Lambda(
        lambda x: tf.sqrt(tf.re

None

Building Autoencoder CNN model...


None

Building Capsule Network model...
CapsNet output layers: ['keras_tensor_482', 'keras_tensor_489']


None

Training Attention-Enhanced CNN model...
Epoch 1/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.7224 - loss: 0.8151 - val_accuracy: 0.8478 - val_loss: 0.4470 - learning_rate: 0.0010
Epoch 2/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.8723 - loss: 0.3583 - val_accuracy: 0.8196 - val_loss: 0.5592 - learning_rate: 9.9726e-04
Epoch 3/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.8984 - loss: 0.2830 - val_accuracy: 0.8971 - val_loss: 0.2720 - learning_rate: 9.8907e-04
Epoch 4/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9073 - loss: 0.2567 - val_accuracy: 0.8960 - val_loss: 0.2862 - learning_rate: 9.7553e-04
Epoch 5/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9155 - loss: 0.2340 - val_accuracy: 0.9036 - val_loss: 0.2706 - learning_rate: 9.5677e-04
Epoc

  plt.legend()



Evaluating all models...

Evaluating Attention-Enhanced CNN model...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9251 - loss: 0.3642
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step

Evaluating Autoencoder CNN model...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step

Evaluating Capsule Network model...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step

AE-CNN Classification Report:
              precision    recall  f1-score   support

 T-shirt/top       0.90      0.85      0.87      1000
     Trouser       0.99      0.99      0.99      1000
    Pullover       0.90      0.90      0.90      1000
       Dress       0.93      0.91      0.92      1000
        Coat       0.87      0.91      0.89      1000
      Sandal       0.99      0.98      0.99      1000
       Shirt       0.77      0.79      0.78      1000
     Sneaker       0.95      0.98      0.97      1000
   

<Figure size 1200x800 with 0 Axes>