# Vision Transformer (ViT) model

In [1]:
import tensorflow as tf
import os
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
import matplotlib.pyplot as plt




In [2]:
# Define Vision Transformer Model
class PositionEmbedding(layers.Layer):
    """Add a learned position vector to every patch embedding.

    The table is created with ``add_weight`` so it is owned by the layer:
    it is updated by the optimizer and stored with the model weights.
    Calling ``layers.Embedding`` on a concrete ``tf.range`` tensor (as this
    notebook did before) runs the layer eagerly, and the result is added to
    the patches as a fixed constant that is never trained.
    """

    def build(self, input_shape):
        num_patches, embed_dim = input_shape[1], input_shape[2]
        if num_patches is None or embed_dim is None:
            raise ValueError(
                "PositionEmbedding needs a static (num_patches, projection_dim) "
                f"shape, got {tuple(input_shape)}."
            )
        # One trainable vector per patch position, same initializer family
        # as the Embedding layer it replaces.
        self.position_embedding = self.add_weight(
            name="position_embedding",
            shape=(int(num_patches), int(embed_dim)),
            initializer="random_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        # The (num_patches, dim) table broadcasts over the batch axis.
        return inputs + self.position_embedding


# Let load_model(...) resolve the layer without a custom_objects argument.
# Plain dict assignment keeps this cell safe to re-run.
tf.keras.utils.get_custom_objects()["PositionEmbedding"] = PositionEmbedding


def create_vit_model(input_shape, num_classes, patch_size=16, projection_dim=64, transformer_layers=4, num_heads=4):
    """Build a small Vision Transformer classifier with the Keras functional API.

    Args:
        input_shape: Shape of one input image, passed to ``layers.Input``.
        num_classes: Number of units in the final softmax layer.
        patch_size: Kernel size and stride of the patch-extracting convolution.
        projection_dim: Embedding width of each patch.
        transformer_layers: Number of encoder blocks stacked.
        num_heads: Attention heads per encoder block.

    Returns:
        An uncompiled Keras ``Model`` mapping images to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Patching: a strided convolution cuts the image into non-overlapping
    # patches and projects each one to projection_dim in a single step.
    patches = layers.Conv2D(filters=projection_dim, kernel_size=patch_size, strides=patch_size, padding='valid')(inputs)
    patches_flat = layers.Reshape((-1, projection_dim))(patches)

    # Positional Encoding (trainable, part of the model graph)
    patches_encoded = PositionEmbedding()(patches_flat)

    # Transformer Layers
    for _ in range(transformer_layers):
        # Multi-Head self-attention: query and value are the same sequence.
        attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)(patches_encoded, patches_encoded)
        attention_output = layers.Add()([patches_encoded, attention_output])  # Residual
        attention_output = layers.LayerNormalization()(attention_output)

        # Feed-Forward Network
        ff_output = layers.Dense(2 * projection_dim, activation="relu")(attention_output)
        ff_output = layers.Dense(projection_dim)(ff_output)
        patches_encoded = layers.Add()([attention_output, ff_output])  # Residual
        patches_encoded = layers.LayerNormalization()(patches_encoded)

    # Classification Head: average over patches, then softmax over classes.
    representation = layers.GlobalAveragePooling1D()(patches_encoded)
    outputs = layers.Dense(num_classes, activation="softmax")(representation)
    return Model(inputs, outputs)

# Function to Train the Model
def train_vit(train_dir, test_dir, input_shape, num_classes, batch_size=32, epochs=10, model_path='vit_model.h5'):
    """Train a ViT classifier on a directory dataset and save it.

    Args:
        train_dir: Directory with one sub-directory per class (training set).
        test_dir: Directory with the same layout, used as validation data.
        input_shape: Image shape; the first two entries are the resize target.
        num_classes: Number of output classes of the model.
        batch_size: Batch size of both generators.
        epochs: Number of training epochs.
        model_path: Where the trained model is written.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    # Create the output directory before training so a missing folder does
    # not surface only when model.save runs after the final epoch.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Data Generators
    train_datagen = ImageDataGenerator(rescale=1./255)
    test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        train_dir,
        target_size=input_shape[:2],
        batch_size=batch_size,
        class_mode='categorical'
    )
    test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=input_shape[:2],
        batch_size=batch_size,
        class_mode='categorical'
    )

    # Create Model
    model = create_vit_model(input_shape, num_classes)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train Model
    try:
        history = model.fit(train_generator, validation_data=test_generator, epochs=epochs)
    except KeyboardInterrupt:
        # Keep the partially trained weights, but in a separate file so an
        # existing model at model_path is not overwritten; then re-raise.
        root, ext = os.path.splitext(model_path)
        model.save(f"{root}_interrupted{ext}")
        raise

    model.save(model_path)
    return history

# Function to Predict Test Images
def predict_test_images(test_dir, model_path, input_shape):
    """Load a saved model, classify every image under ``test_dir`` and print the accuracy.

    Images are rescaled by 1/255 and resized to ``input_shape[:2]``. The
    iterator is unshuffled so predictions line up with its ``classes`` array.
    """
    height_width = input_shape[:2]
    eval_flow = ImageDataGenerator(rescale=1./255).flow_from_directory(
        test_dir,
        target_size=height_width,
        batch_size=1,
        class_mode='categorical',
        shuffle=False,
    )

    vit = load_model(model_path)
    class_probabilities = vit.predict(eval_flow)

    # Element-wise comparison of the arg-max class with the directory labels.
    hits = np.argmax(class_probabilities, axis=1) == eval_flow.classes

    # Print Accuracy
    accuracy = np.mean(hits)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Function to Predict a User-Provided Image
def predict_single_image(image_path, model_path, input_shape, class_indices):
    """Classify one image file with a saved model and print the predicted class.

    Args:
        image_path: Path of the image to classify.
        model_path: Path of the saved Keras model.
        input_shape: Image shape; the first two entries are the resize target.
        class_indices: Mapping between class indices and class names. Both
            orientations are accepted: ``{index: name}`` (what this notebook
            builds) and ``{name: index}`` (the layout of a Keras directory
            iterator's ``class_indices``).

    Returns:
        The label looked up for the predicted class index.

    Raises:
        KeyError: If the predicted index is missing from ``class_indices``.
    """
    model = load_model(model_path)
    img = image.load_img(image_path, target_size=input_shape[:2])
    # Same 1/255 rescaling as the training generators, plus a batch axis.
    img_array = image.img_to_array(img) / 255.0
    img_array = np.expand_dims(img_array, axis=0)

    predictions = model.predict(img_array)
    predicted_class = int(np.argmax(predictions, axis=1)[0])

    # Map predicted class index to class name
    if predicted_class in class_indices:
        predicted_label = class_indices[predicted_class]
    else:
        # Mapping was given as {name: index}; look the index up in reverse.
        index_to_name = {index: name for name, index in class_indices.items()}
        if predicted_class not in index_to_name:
            raise KeyError(f"No class name registered for predicted index {predicted_class}")
        predicted_label = index_to_name[predicted_class]

    print(f"Predicted Class: {predicted_label}")
    return predicted_label


In [8]:
# Both dataset locations live under the same CUB_KMean_Dataset folder.
_kmean_dataset_root = os.path.join('..', 'Dataset', 'CUB_KMean_Dataset')
source_dataset_path = os.path.join(_kmean_dataset_root, 'images')  # raw class folders
destination_dataset_path = os.path.join(_kmean_dataset_root, 'ViT_data')  # train/val/test split

In [3]:
# # Split dataset
# import splitfolders

# source_dataset_path = os.path.join('..', 'Dataset', 'CUB_KMean_Dataset', 'images')
# destination_dataset_path = os.path.join('..', 'Dataset', 'CUB_KMean_Dataset', 'ViT_data')

# splitfolders.ratio(source_dataset_path,
#                    destination_dataset_path,
#                    seed = 41,
#                    ratio=(0.7, 0.2, 0.1)) # train/val/test

In [4]:
# if __name__ == "__main__":
#     # Define Parameters
#     input_shape = (224, 224, 3)
#     train_dir = os.path.join(destination_dataset_path, 'train')
#     test_dir = os.path.join(destination_dataset_path, 'val')
#     num_classes = len(os.listdir(train_dir))
#     model_path = os.path.join('..', 'Models', 'vit_model.h5')

#     # Train Model
#     train_vit(train_dir, test_dir, input_shape, num_classes, epochs=100, model_path=model_path)

#     # Predict Test Images
#     predict_test_images(test_dir, model_path, input_shape)

#     # Predict Single Image
#     class_indices = {i: class_name for i, class_name in enumerate(os.listdir(train_dir))}
#     image_path = os.path.join('..', 'Dataset', 'CUB_KMean_Dataset', 'ViT_data', 'test', 'Acadian_Flycatcher', "Acadian_Flycatcher_0014_795607.jpg") 
#     predict_single_image(image_path, model_path, input_shape, class_indices)

Note: The code above (Part 1) has been run and works.

## Part 2

In [5]:
import tensorflow as tf
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix, accuracy_score, precision_recall_fscore_support
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image

In [6]:
# Define Vision Transformer Model
class PositionEmbedding(tf.keras.layers.Layer):
    """Add a learned position vector to every patch embedding.

    The table is created with ``add_weight`` so it is owned by the layer:
    it is updated by the optimizer and stored with the model weights.
    Calling ``Embedding`` on a concrete ``tf.range`` tensor (as this notebook
    did before) runs the layer eagerly, and the result is added to the
    patches as a fixed constant that is never trained.
    """

    def build(self, input_shape):
        num_patches, embed_dim = input_shape[1], input_shape[2]
        if num_patches is None or embed_dim is None:
            raise ValueError(
                "PositionEmbedding needs a static (num_patches, projection_dim) "
                f"shape, got {tuple(input_shape)}."
            )
        # One trainable vector per patch position, same initializer family
        # as the Embedding layer it replaces.
        self.position_embedding = self.add_weight(
            name="position_embedding",
            shape=(int(num_patches), int(embed_dim)),
            initializer="random_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        # The (num_patches, dim) table broadcasts over the batch axis.
        return inputs + self.position_embedding


# Let load_model(...) resolve the layer without a custom_objects argument.
# Plain dict assignment keeps this cell safe to re-run.
tf.keras.utils.get_custom_objects()["PositionEmbedding"] = PositionEmbedding


def create_vit_model(input_shape, num_classes, patch_size=16, projection_dim=64, transformer_layers=4, num_heads=4):
    """Build a small Vision Transformer classifier with the Keras functional API.

    Args:
        input_shape: Shape of one input image, passed to ``Input``.
        num_classes: Number of units in the final softmax layer.
        patch_size: Kernel size and stride of the patch-extracting convolution.
        projection_dim: Embedding width of each patch.
        transformer_layers: Number of encoder blocks stacked.
        num_heads: Attention heads per encoder block.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping images to class probabilities.
    """
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Patching: a strided convolution cuts the image into non-overlapping
    # patches and projects each one to projection_dim in a single step.
    patches = tf.keras.layers.Conv2D(filters=projection_dim, kernel_size=patch_size, strides=patch_size, padding='valid')(inputs)
    patches_flat = tf.keras.layers.Reshape((-1, projection_dim))(patches)

    # Positional Encoding (trainable, part of the model graph)
    patches_encoded = PositionEmbedding()(patches_flat)

    # Transformer Layers
    for _ in range(transformer_layers):
        # Multi-Head self-attention: query and value are the same sequence.
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)(patches_encoded, patches_encoded)
        attention_output = tf.keras.layers.Add()([patches_encoded, attention_output])  # Residual
        attention_output = tf.keras.layers.LayerNormalization()(attention_output)

        # Feed-Forward Network
        ff_output = tf.keras.layers.Dense(2 * projection_dim, activation="relu")(attention_output)
        ff_output = tf.keras.layers.Dense(projection_dim)(ff_output)
        patches_encoded = tf.keras.layers.Add()([attention_output, ff_output])  # Residual
        patches_encoded = tf.keras.layers.LayerNormalization()(patches_encoded)

    # Classification Head: average over patches, then softmax over classes.
    representation = tf.keras.layers.GlobalAveragePooling1D()(patches_encoded)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(representation)
    return tf.keras.Model(inputs, outputs)

# Function to Train the Model
def train_vit(train_dir, test_dir, input_shape, num_classes, batch_size=32, epochs=10, model_path='vit_model.h5'):
    """Train the larger ViT configuration with augmentation, then save model and curves.

    Args:
        train_dir: Directory with one sub-directory per class (training set).
        test_dir: Directory with the same layout, used as validation data.
        input_shape: Image shape; the first two entries are the resize target.
        num_classes: Number of output classes of the model.
        batch_size: Batch size of both generators.
        epochs: Number of training epochs.
        model_path: Where the trained model is written.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    # Create the output directory before training so a missing folder does
    # not surface only when model.save runs after the final epoch.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Data Generators: augmentation is applied to the training images only.
    train_datagen = ImageDataGenerator(rescale=1./255,
                                       horizontal_flip=True,
                                       height_shift_range=0.1,
                                       width_shift_range=0.1,
                                       brightness_range=(0.5,1.5),
                                       zoom_range = [1, 1.5])
    test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        train_dir,
        target_size=input_shape[:2],
        batch_size=batch_size,
        class_mode='categorical'
    )
    test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=input_shape[:2],
        batch_size=batch_size,
        class_mode='categorical'
    )

    # Create Model
    model = create_vit_model(input_shape, num_classes, patch_size=16, projection_dim=196, transformer_layers=16, num_heads=4)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train Model
    try:
        history = model.fit(train_generator, validation_data=test_generator, epochs=epochs)
    except KeyboardInterrupt:
        # Keep the partially trained weights, but in a separate file so an
        # existing model at model_path is not overwritten; then re-raise.
        root, ext = os.path.splitext(model_path)
        model.save(f"{root}_interrupted{ext}")
        raise

    # Save Model
    model.save(model_path)

    # Save Plots
    plot_training_metrics(history, 'training_metrics.png')
    return history

# Function to Plot Training Metrics
def plot_training_metrics(history, save_path):
    """Write a two-panel figure (accuracy, loss) of training vs. validation curves.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'. The figure
    is saved to ``save_path`` and closed without being shown.
    """
    logs = history.history

    # Read every series before any figure exists, so a missing key fails early.
    panels = [
        ('Accuracy', 'Training Accuracy', logs['accuracy'], 'Validation Accuracy', logs['val_accuracy']),
        ('Loss', 'Training Loss', logs['loss'], 'Validation Loss', logs['val_loss']),
    ]
    epoch_axis = range(1, len(panels[0][2]) + 1)

    plt.figure(figsize=(12, 6))
    for slot, (title, train_label, train_series, val_label, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(epoch_axis, train_series, label=train_label)
        plt.plot(epoch_axis, val_series, label=val_label)
        plt.title(title)
        plt.legend()

    plt.savefig(save_path)
    plt.close()

# Function to Evaluate the Model and Generate Metrics
def evaluate_model(test_dir, model_path, input_shape, class_indices):
    """Evaluate a saved model on a directory dataset and save a confusion matrix.

    Prints accuracy, weighted precision/recall/F1 and Cohen's kappa, and
    writes the confusion-matrix heatmap to ``confusion_matrix.png``.

    Args:
        test_dir: Directory with one sub-directory per class.
        model_path: Path of the saved Keras model.
        input_shape: Image shape; the first two entries are the resize target.
        class_indices: Kept for backward compatibility and not consulted.
            Tick labels come from the directory iterator's own
            ``class_indices`` so they always match the matrix rows/columns.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, kappa)``.
    """
    test_datagen = ImageDataGenerator(rescale=1./255)
    test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=input_shape[:2],
        batch_size=1,
        class_mode='categorical',
        shuffle=False  # keeps predictions aligned with test_generator.classes
    )
    model = load_model(model_path)
    predictions = model.predict(test_generator)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = test_generator.classes

    # The iterator maps name -> index; invert it to label the matrix axes.
    index_to_name = {index: name for name, index in test_generator.class_indices.items()}
    label_ids = sorted(index_to_name)
    tick_labels = [index_to_name[index] for index in label_ids]

    # Metrics
    accuracy = accuracy_score(true_classes, predicted_classes)
    # zero_division=0 keeps the default value for never-predicted classes
    # without emitting a warning for each of them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_classes, predicted_classes, average='weighted', zero_division=0
    )
    kappa = cohen_kappa_score(true_classes, predicted_classes)
    # Fix the label set so the matrix has one row/column per class even when
    # a class is absent from both the true and the predicted labels.
    cm = confusion_matrix(true_classes, predicted_classes, labels=label_ids)

    # Print Metrics
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Cohen's Kappa: {kappa:.2f}")

    # Plot Confusion Matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig("confusion_matrix.png")
    plt.close()

    return accuracy, precision, recall, f1, kappa


In [9]:
if __name__ == "__main__":
    # Define Parameters
    input_shape = (224, 224, 3)
    train_dir = os.path.join(destination_dataset_path, 'train')
    test_dir = os.path.join(destination_dataset_path, 'val')
    model_path = os.path.join('..', 'Models', 'vit_model_2.h5')

    # flow_from_directory treats every sub-directory as a class and numbers
    # the classes in sorted order. os.listdir order is arbitrary and may
    # include stray files, so filter to directories and sort explicitly.
    class_names = sorted(
        entry for entry in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, entry))
    )
    num_classes = len(class_names)
    class_indices = dict(enumerate(class_names))  # index -> class name

    # Train Model
    train_vit(train_dir, test_dir, input_shape, num_classes, epochs=100, model_path=model_path)

    # Evaluate Model
    evaluate_model(test_dir, model_path, input_shape, class_indices)

    # Predict Single Image
    image_path = os.path.join('..', 'Dataset', 'CUB_KMean_Dataset', 'ViT_data', 'test', 'Acadian_Flycatcher', "Acadian_Flycatcher_0014_795607.jpg")
    predict_single_image(image_path, model_path, input_shape, class_indices)

Found 4146 images belonging to 101 classes.
Found 1172 images belonging to 101 classes.



Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

KeyboardInterrupt: 