In [None]:
import random
import os
import glob
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, Sequential
from tensorflow.keras.utils import plot_model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, precision_recall_fscore_support,
                             accuracy_score, top_k_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, ConfusionMatrixDisplay)
from scikitplot.metrics import plot_roc

In [None]:
class CFG:
    """Central configuration shared by the cells of this notebook."""
    # Training schedule
    EPOCHS = 5
    BATCH_SIZE = 32
    # SEED drives DataFrame shuffling, the train/validation split, the weight
    # initializers and tf.random.set_seed; TF_SEED seeds the augmentation layers.
    SEED = 42
    TF_SEED = 768
    # Model input geometry
    HEIGHT = 224
    WIDTH = 224
    CHANNELS = 3
    # Derived from the three values above so the two can never drift apart
    IMAGE_SIZE = (HEIGHT, WIDTH, CHANNELS)

<a id='1'></a>
# 1 | Dataset Exploration
<div style='border-width: 2px;
              border-bottom-width:4px;
              border-bottom-color:#B6EADA;
              border-bottom-style: solid;'></div>
<br>  

In [None]:
# Dataset root, plus the two sub-folders used as the train and test sets
# (the dataset's `valid/` folder plays the role of the test set here)
DATASET_PATH = "/kaggle/input/indian-birds/Birds_25/"
TRAIN_PATH = DATASET_PATH + 'train/'
TEST_PATH = DATASET_PATH + 'valid/'

<a id='1.1'></a>
### Get image paths with glob

In [None]:
%%time
# Collect the .jpg files that sit exactly one directory below each split root.
# Note: `recursive=True` is not passed, so `**` behaves like a single `*`
# (one directory level); only files ending in `.jpg` are matched.
train_images = glob.glob(f"{TRAIN_PATH}**/*.jpg")
test_images = glob.glob(f"{TEST_PATH}**/*.jpg")

<a id='1.2'></a>
### View the number of images present in the dataset

In [None]:
# Number of image paths found for each split
train_size, test_size = map(len, (train_images, test_images))

# Combined number of images
total = sum((train_size, test_size))

# Report the counts
print(
    f'train samples count:\t\t{train_size}',
    f'test samples count:\t\t{test_size}',
    '=======================================',
    f'TOTAL:\t\t\t\t{total}',
    sep='\n',
)

<a id='1.3'></a>
### Create Pandas DataFrames for paths and labels

In [None]:
def generate_labels(image_paths):
    """Derive one label per path from the name of the file's parent directory.

    Paths are split on '/', the second-to-last component is taken, and every
    '-' in it is replaced by '_'. A string with no '/' has a single component,
    which is used as-is (after the same '-' -> '_' replacement).
    """
    labels = []
    for path in image_paths:
        components = path.split('/')
        parent = components[-2:][0]
        labels.append(parent.replace('-', '_'))
    return labels


def build_df(image_paths, labels, seed=None):
    """Build a shuffled DataFrame that pairs every image path with its label.

    @params
        - image_paths: (iterable of str) -> paths of the image files
        - labels: (iterable of str) -> label of each image, in the same order as
          image_paths. The labels are stored exactly as given; they are not
          parsed as paths again.
        - seed: (int) -> random state for the shuffle, CFG.SEED when None (default=None)

    @returns
        - df: (pd.DataFrame) -> columns 'image_path' and 'label', rows shuffled,
          index reset to 0..n-1
    """
    if seed is None:
        seed = CFG.SEED

    # Create dataframe
    df = pd.DataFrame({
        'image_path': list(image_paths),
        'label': list(labels)
    })

    # Shuffle and return df
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [None]:
# One shuffled (image_path, label) table per split
train_df, test_df = (
    build_df(split_paths, generate_labels(split_paths))
    for split_paths in (train_images, test_images)
)

In [None]:
# Peek at the first five rows of the training table (five is head()'s default)
train_df.head()

<a id=1.4></a>
### Label Encode Image Labels

In [None]:
# Learn the label -> integer mapping from the training labels only
label_encoder = LabelEncoder().fit(train_df.label)

# Apply that single mapping to both tables
train_df['label_encoded'] = label_encoder.transform(train_df.label)
test_df['label_encoded'] = label_encoder.transform(test_df.label)

# Peek at the first ten rows
train_df.head(10)

In [None]:
# The fitted encoder holds the class names; their count is the number of classes
class_names = label_encoder.classes_
num_classes = len(class_names)

print(f'Number of classes: {num_classes}')
print(f'Classes: {class_names}')

<a id='1.5'></a>
### Load & View Random Sample Image

In [None]:
def _load(image_path):
    """Read a JPEG file and return it as a float32 image tensor scaled to [0, 1].

    @params
        - image_path: (str or string tensor) -> path of the JPEG file to read

    @returns
        - image: (tf.Tensor) -> float32 tensor of shape (CFG.HEIGHT, CFG.WIDTH, 3)
          with every value inside [0, 1]
    """
    # Read and decode the file into a 3-channel uint8 tensor
    raw_bytes = tf.io.read_file(image_path)
    image = tf.io.decode_jpeg(raw_bytes, channels=3)

    # Resize image
    image = tf.image.resize(image, [CFG.HEIGHT, CFG.WIDTH],
                            method=tf.image.ResizeMethod.LANCZOS5)

    # Convert image dtype to float32 and scale by 1/255
    image = tf.cast(image, tf.float32)/255.

    # Lanczos resampling can ring and overshoot the input range, which would
    # leave values slightly below 0 or above 1; clip so the result really is
    # normalized to [0, 1].
    return tf.clip_by_value(image, 0.0, 1.0)

def view_sample(image, label, color_map='rgb', fig_size=(8, 10)):
    """Show a single image in a new figure, titled with its label.

    With color_map='rgb' the image is drawn as given; any other value converts
    the image to grayscale and is passed to matplotlib as the colormap name.
    """
    plt.figure(figsize=fig_size)

    draw_as_grayscale = color_map != 'rgb'
    if draw_as_grayscale:
        plt.imshow(tf.image.rgb_to_grayscale(image), cmap=color_map)
    else:
        plt.imshow(image)

    plt.title(f'Label: {label}', fontsize=16)

In [None]:
# Draw one index label at random from the training table
(idx,) = random.sample(train_df.index.to_list(), 1)

# Load that row's image and look up its class name
sample_image = _load(train_df.image_path[idx])
sample_label = train_df.label[idx]

# Display it
view_sample(sample_image, sample_label)

<a id=1.6></a>
### View Multiple Randomly Selected Samples

In [None]:
def view_mulitiple_samples(df, sample_loader, count=10, color_map='rgb', fig_size=(14, 10)):
    """Show `count` randomly chosen images from `df` in a grid that is five columns wide.

    @params
        - df: (pd.DataFrame) -> table with 'image_path' and 'label' columns
        - sample_loader: (function) -> loads an image given its path
        - count: (int) -> number of images to draw (default=10)
        - color_map: (str) -> 'rgb' draws images as loaded; any other value converts
          them to grayscale and is used as the matplotlib colormap (default='rgb')
        - fig_size: (tuple) -> figure size passed to matplotlib (default=(14, 10))
    """
    # Five images per row, plus one extra row for any remainder
    full_rows, leftover = divmod(count, 5)
    rows = full_rows + (1 if leftover > 0 else 0)

    chosen = random.sample(df.index.to_list(), count)
    plt.figure(figsize=fig_size)

    for position, row_label in enumerate(chosen, start=1):
        plt.subplot(rows, 5, position)
        plt.title(f'Label: {df.label[row_label]}')

        picture = sample_loader(df.image_path[row_label])
        if color_map != 'rgb':
            plt.imshow(tf.image.rgb_to_grayscale(picture), cmap=color_map)
        else:
            plt.imshow(picture)

# Draw a 5 x 5 grid of random training images
view_mulitiple_samples(df=train_df, sample_loader=_load, count=25, fig_size=(20, 24))

<a id='1.7'></a>
### View Train Labels Distribution

In [None]:
fig, (ax1, ax2) = plt.subplots(2, figsize=(14, 10))

# Leave room between the two panels
fig.tight_layout(pad=6.0)

# Images per class for each split, smallest class first
train_distribution = train_df['label'].value_counts().sort_values()
test_distribution = test_df['label'].value_counts().sort_values()

# One horizontal bar chart per split
for axis, title, distribution in (
    (ax1, 'Train Labels Distribution', train_distribution),
    (ax2, 'Test Labels Distribution', test_distribution),
):
    axis.set_title(title, fontsize=20)
    sns.barplot(x=distribution.values,
                y=list(distribution.keys()),
                orient="h",
                ax=axis)

<div class="alert alert-block alert-info">
    <h3>Observe</h3>
    We see that both the training and test sets are <code>perfectly balanced</code>. However, we'll need to create a validation set for hyperparameter tuning.
</div>

<center><div style='color:#ffffff;
           display:inline-block;
           padding: 5px 5px 5px 5px;
           border-radius:5px;
           background-color:#B6EADA;
           font-size:100%;'><a href=#toc style='text-decoration: none; color:#03001C;'>⬆️ Back To Top</a></div></center>

<a id='2'></a>
# 2 | Data Preprocessing: Building An Input Data Pipeline


<a id='2.1'></a>
### Create Train & Validation Splits

In [None]:
# Hold out a stratified 20% of the training rows for validation.
# Only the row index needs to be split; the encoded labels serve solely as the
# stratification target.
train_split_idx, val_split_idx = train_test_split(
    train_df.index,
    test_size=0.20,
    stratify=train_df.label_encoded,
    random_state=CFG.SEED,
)

In [None]:
# Get new training and validation data.
# The split arrays come from splitting `train_df.index`, so they hold index
# *labels*; select with `.loc` (label-based) rather than `.iloc` (positional).
# The two only coincide while the index is exactly 0..n-1.
train_new_df = train_df.loc[train_split_idx].reset_index(drop=True)
val_df = train_df.loc[val_split_idx].reset_index(drop=True)

# View shapes
train_new_df.shape, val_df.shape

In [None]:
# Shape of the full training table (before the train/validation split), for comparison
train_df.shape

In [None]:
# Row counts of the three splits now in use
train_size, val_size, test_size = (len(split) for split in (train_new_df, val_df, test_df))
total = sum((train_size, val_size, test_size))

# Report each split's size and its share of all images
print(
    f'train samples count:\t\t{train_size}\t({(100 * train_size/total):.2f}%)',
    f'validation samples count:\t{val_size}\t({(100 * val_size/total):.2f}%)',
    f'test samples count:\t\t{test_size}\t({(100 * test_size/total):.2f}%)',
    '================================================',
    f'TOTAL:\t\t\t\t{total}\t({(100 * total/total):.2f}%)',
    sep='\n',
)

<a id='2.2'></a>
### View New Train & Validation Labels Distribution

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(16, 24))

# Leave room between the three panels
fig.tight_layout(pad=6.0)

# Images per class for each split, smallest class first
train_distribution = train_new_df['label'].value_counts().sort_values()
val_distribution = val_df['label'].value_counts().sort_values()
test_distribution = test_df['label'].value_counts().sort_values()

# One horizontal bar chart per split
for axis, title, distribution in (
    (ax1, 'Train Labels Distribution', train_distribution),
    (ax2, 'Validation Labels Distribution', val_distribution),
    (ax3, 'Test Labels Distribution', test_distribution),
):
    axis.set_title(title, fontsize=20)
    sns.barplot(x=distribution.values,
                y=list(distribution.keys()),
                orient="h",
                ax=axis)

<a id='2.3'></a>
### Create an Image Data Augmentation Layer

In [None]:
# Augmentation = random horizontal/vertical flips followed by a random zoom of
# up to +/-10% along each axis; both layers are seeded with CFG.TF_SEED
random_flip = layers.RandomFlip(mode='horizontal_and_vertical', seed=CFG.TF_SEED)
random_zoom = layers.RandomZoom(height_factor=(-0.1, 0.1),
                                width_factor=(-0.1, 0.1),
                                seed=CFG.TF_SEED)

augmentation_layer = Sequential([random_flip, random_zoom], name='augmentation_layer')

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 10))

# Leave room between the two panels
fig.tight_layout(pad=6.0)

# Left: the sample as loaded
ax1.set_title('Original Image', fontsize=20)
ax1.imshow(sample_image)

# Right: the same sample after one pass through the augmentation layer
augmented_sample = augmentation_layer(sample_image)
ax2.set_title('Augmented Image', fontsize=20)
ax2.imshow(augmented_sample);

<div class="alert alert-block alert-info">
    <h3>Observe</h3>
    The augmentation layer generates images that differ slightly from the originals. This is intended: we want augmented images that resemble the original dataset images while preserving the images' key features.
</div>

<a id='2.4'></a>
### Create Input Data Pipeline w. tf.data API

In [None]:
def encode_labels(labels, encode_depth=25):
    """One-hot encode integer class ids to `encode_depth` columns and return a NumPy array."""
    one_hot = tf.one_hot(labels, depth=encode_depth)
    return one_hot.numpy()

def create_pipeline(df, load_function, augment=False, batch_size=32, shuffle=False, cache=None, prefetch=False):
    '''
    Generates an input pipeline using the tf.data API given a Pandas DataFrame and image loading function.

    The stages are applied in this order:
        load -> cache -> shuffle -> augment -> batch -> prefetch
    Caching sits directly after the deterministic load step so that the random
    stages (shuffle, augment) are re-run on every pass over the dataset instead
    of being frozen into the cache. The cache therefore stores individual
    (image, label) elements rather than batches.

    @params
        - df: (pd.DataFrame) -> DataFrame containing paths ('image_path') and encoded labels ('label_encoded')
        - load_function: (function) -> function used to load images given their paths
        - augment: (bool) -> condition for applying augmentation (default=False)
        - batch_size: (int) -> size for batches (default=32)
        - shuffle: (bool) -> condition for data shuffling, data is shuffled when True (default=False)
        - cache: (str) -> cache path for caching data, data is not cached when None (default=None)
        - prefetch: (bool) -> condition for prefetching data, data is prefetched when True (default=False)

    @returns
        - dataset: (tf.data.Dataset) -> dataset input pipeline used to train a TensorFlow model
    '''
    # Get image paths and one-hot labels from the DataFrame
    image_paths = df.image_path
    image_labels = encode_labels(df.label_encoded)
    AUTOTUNE = tf.data.AUTOTUNE

    # Create dataset with raw data from DataFrame
    ds = tf.data.Dataset.from_tensor_slices((image_paths, image_labels))

    # Deterministic step: turn every path into a decoded image
    ds = ds.map(lambda x, y: (load_function(x), y), num_parallel_calls=AUTOTUNE)

    # Cache the decoded images, before any random transformation.
    # Note: Use cache in memory (cache='') if the data is small enough to fit in memory!!!
    if cache is not None:
        ds = ds.cache(cache)

    # Apply shuffling based on condition
    if shuffle:
        ds = ds.shuffle(buffer_size=1000)

    # Random step: apply the notebook-level augmentation layer to each image
    if augment:
        ds = ds.map(lambda x, y: (augmentation_layer(x), y), num_parallel_calls=AUTOTUNE)

    # Apply batching
    ds = ds.batch(batch_size)

    # Apply prefetching based on condition
    # Note: This will result in memory trade-offs
    if prefetch:
        ds = ds.prefetch(buffer_size=AUTOTUNE)

    # Return the dataset
    return ds

In [None]:
# Training pipeline (the only one that prefetches).
# NOTE(review): it is built with augment=False and shuffle=False, so this
# pipeline never applies `augmentation_layer` and never reorders its samples
# - confirm that this is intended.
train_ds = create_pipeline(
    df=train_new_df, load_function=_load,
    augment=False, shuffle=False,
    batch_size=CFG.BATCH_SIZE, prefetch=True,
)

# Validation pipeline: fixed order, no prefetching
val_ds = create_pipeline(
    df=val_df, load_function=_load,
    shuffle=False,
    batch_size=CFG.BATCH_SIZE, prefetch=False,
)

# Test pipeline: fixed order, no prefetching
test_ds = create_pipeline(
    df=test_df, load_function=_load,
    shuffle=False,
    batch_size=CFG.BATCH_SIZE, prefetch=False,
)

In [None]:
# Print the repr of each pipeline between divider lines
divider = '========================================'
for pipeline_title, pipeline in (('Train', train_ds), ('Validation', val_ds), ('Test', test_ds)):
    print(divider)
    print(f'{pipeline_title} Input Data Pipeline:\n\n', pipeline)
print(divider)

<center><div style='color:#ffffff;
           display:inline-block;
           padding: 5px 5px 5px 5px;
           border-radius:5px;
           background-color:#B6EADA;
           font-size:100%;'><a href=#toc style='text-decoration: none; color:#03001C;'>⬆️ Back To Top</a></div></center>
           
<a id="3"></a>
# 3 | Transfer Learning Model: EfficientNet V2 B0
<div style='border-width: 2px;
              border-bottom-width:4px;
              border-bottom-color:#B6EADA;
              border-bottom-style: solid;'></div>
<br>  

EfficientNetV2 is a convolutional neural network that has faster training speed and better parameter efficiency than previous EfficientNetV1 models. To develop these models, the authors use a combination of training-aware neural architecture search and scaling, to jointly optimize training speed. The models were searched from the search space enriched with new ops such as Fused-MBConv.

<center>
    <figure>
        <img src="https://i.postimg.cc/vBSTR3V0/59864ee4.png" alt ="MBConv and Fused-MBConv" style='width: 400px;'>
        <figcaption>
            Image Source: <a href="https://wandb.ai/wandb_fc/pytorch-image-models/reports/EfficientNetV2--Vmlldzo2NTkwNTQ">[Source]</a></figcaption>
    </figure>
</center>

**For more information follow the links below:**
> - EfficientNet V2 Paper (2021): [EfficientNetV2: Smaller Models and Faster Training](https://arxiv.org/pdf/2104.00298v2.pdf)
> - [EfficientNet: Improving Accuracy and Efficiency through AutoML and Model Scaling](https://ai.googleblog.com/2019/05/efficientnet-improving-accuracy-and.html)
> - [Wandb: EfficientNetV2](https://wandb.ai/wandb_fc/pytorch-image-models/reports/EfficientNetV2--Vmlldzo2NTkwNTQ)

<a id="tfhub"></a>
## TensorFlow Hub

TensorFlow Hub is a repository of trained machine learning models ready for fine-tuning and deployable anywhere. Reuse trained models like BERT and Faster R-CNN with just a few lines of code.

In this section we'll be using pre-trained models from TensorFlow Hub. 

> For more information on TensorFlow Hub or if you would like to access the same models in PyTorch/JAX, check out the following links:
> - [TensorFlow Hub](https://www.tensorflow.org/hub)
> - [HuggingFace🤗](https://huggingface.co/)

In [None]:
# Helper for pulling any model/preprocessor from TensorFlow Hub
def get_tfhub_model(model_link, model_name, model_trainable=False):
    """Wrap the TensorFlow Hub module at `model_link` in a `hub.KerasLayer`.

    `model_name` becomes the layer's name and `model_trainable` its `trainable` flag.
    """
    hub_layer = hub.KerasLayer(model_link, trainable=model_trainable, name=model_name)
    return hub_layer

<a id="3.1"></a>
### Get EfficientNet From TensorFlow Hub

In [None]:
# TF Hub handle and layer name of the EfficientNetV2-B0 feature extractor
efficientnet_v2_url = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b0/feature_vector/2'
model_name = 'efficientnet_v2_b0'

# The pre-trained weights stay frozen
set_trainable = False

efficientnet_v2_b0 = get_tfhub_model(
    efficientnet_v2_url, model_name,
    model_trainable=set_trainable,
)

<a id="3.2"></a>
### Define EfficientNet Model

In [None]:
def efficientnet_v2_model():
    """Build the EfficientNetV2-B0 classifier.

    The model stacks the notebook-level `efficientnet_v2_b0` layer on an image
    input and adds dropout, two ReLU dense layers (512 and 256 units) and a
    25-way softmax output. All dense kernels share one seeded Glorot-normal
    initializer.
    """
    dense_init = tf.keras.initializers.GlorotNormal(seed=CFG.SEED)

    stem = [
        layers.Input(shape=CFG.IMAGE_SIZE, dtype=tf.float32, name='input_image'),
        efficientnet_v2_b0,
    ]
    head = [
        layers.Dropout(0.2),
        layers.Dense(512, activation='relu', kernel_initializer=dense_init),
        layers.Dense(256, activation='relu', kernel_initializer=dense_init),
        layers.Dense(25, dtype=tf.float32, activation='softmax', kernel_initializer=dense_init),
    ]

    return Sequential(stem + head, name='efficientnet_v2_sequential_model')

In [None]:
# Build the EfficientNetV2-B0 classifier ...
model_efficientnet_v2 = efficientnet_v2_model()

# ... and print its layer-by-layer summary
model_efficientnet_v2.summary()

In [None]:
# Draw the architecture diagram, including tensor shapes
plot_model(model_efficientnet_v2, show_shapes=True, dpi=60)

<a id="3.3"></a>
### Train EfficientNet Model

To train this model we'll use Categorical Crossentropy as the loss function, since this notebook treats the task as a multi-class classification problem (each image belongs to exactly one of the 25 classes). As for the optimizer, we'll use the Adam optimizer with 0.001 as the (default) learning rate. 

To reduce the risk of overfitting during training we'll use TensorFlow's Callback API to implement the EarlyStopping & ReduceLROnPlateau callbacks. The only metrics we'll track while training the model are loss and accuracy.

**See the following for more information:**
>- **Categorical Crossentropy Loss Function:**
>    - [Understanding Categorical Cross-Entropy Loss, Binary Cross-Entropy Loss, Softmax Loss, Logistic Loss, Focal Loss and all those confusing names](https://gombru.github.io/2018/05/23/cross_entropy_loss/)
>    - [TensorFlow Categorical Crossentropy Loss Implementation](https://www.tensorflow.org/api_docs/python/tf/keras/losses/CategoricalCrossentropy)
>- **Adam Optimizer:**
>    - [Academic Paper | Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980)
>    - [TensorFlow Adam Implementation](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)
>- **TensorFlow Callback API:**
>    - [EarlyStopping Implementation](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping)
>    - [ReduceLROnPlateau Implementation](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau)
>- **TensorFlow Metrics:**
>    - [TensorFlow Metrics Overview](https://www.tensorflow.org/api_docs/python/tf/keras/metrics)

In [None]:
def train_model(model, num_epochs, callbacks_list, tf_train_data, 
                tf_valid_data=None, shuffling=False):
    '''
        Trains a model with `model.fit` and returns whatever `fit` returns.

        @params
        - model: (tf.keras.Model) -> model to be trained
        - num_epochs: (int) -> number of epochs to train the model
        - callbacks_list: (list) -> list containing callback functions for model
        - tf_train_data: (tf.data.Dataset) -> dataset for model to be trained on
        - tf_valid_data: (tf.data.Dataset) -> dataset for model to be validated on; no validation
          is run when None (default=None). Must support len(), which is used as the number of
          validation steps.
        - shuffling: (bool) -> forwarded to `fit` as its `shuffle` argument (default=False).
          Note: Keras documents `shuffle` as ignored when the input is a tf.data.Dataset.

        @returns
        - model_history: the return value of `model.fit` (for Keras models, a History object
          whose `.history` dict holds the loss and metric values tracked during training)
    '''
    fit_kwargs = dict(epochs=num_epochs,
                      callbacks=callbacks_list,
                      shuffle=shuffling)

    # Identity check instead of `!= None`: `!=` on array-like objects is
    # element-wise and cannot be used as an `if` condition.
    if tf_valid_data is not None:
        fit_kwargs.update(validation_data=tf_valid_data,
                          validation_steps=int(len(tf_valid_data)))

    return model.fit(tf_train_data, **fit_kwargs)

In [None]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True,
)

# Multiply the learning rate by 0.1 once val_loss has not improved for 2 epochs
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=2, verbose=1,
)

# Shared by every training run in this notebook
CALLBACKS = [early_stopping_callback, reduce_lr_callback]
METRICS = ['accuracy']

In [None]:
tf.random.set_seed(CFG.SEED)

# Compile the model
model_efficientnet_v2.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=METRICS
)

# Train the model.
# The training pipeline is built from `train_new_df` (the training part of the
# train/validation split), so that is the sample count to report, not the size
# of the full `train_df`.
print(f'Training {model_efficientnet_v2.name}.')
print(f'Train on {len(train_new_df)} samples, validate on {len(val_df)} samples.')
print('----------------------------------')

efficientnet_v2_history = train_model(
    model_efficientnet_v2, CFG.EPOCHS, CALLBACKS, 
    train_ds, val_ds,
    shuffling=False
)

In [None]:
# Score the trained EfficientNetV2-B0 classifier on the test pipeline
efficientnet_v2_evaluation = model_efficientnet_v2.evaluate(x=test_ds)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Collect the one-hot ground truth batch by batch from the test pipeline
# (built with shuffle=False, so its order matches the order of the predictions)
y_true = np.concatenate([y.numpy() for _, y in test_ds], axis=0)  # True labels
y_pred_probs = model_efficientnet_v2.predict(test_ds)  # Predicted probabilities
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class indices
y_true = np.argmax(y_true, axis=1)  # Convert one-hot encoded labels to class indices

# Generate Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)

# Plot Confusion Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Efficientnet_v2_b0 Confusion Matrix")  # fixed: the title read "onfusion Matrix"
plt.show()

# Print Classification Report
print("Efficientnet_v2_b0 Classification Report:\n", classification_report(y_true, y_pred, target_names=class_names))


In [None]:
# Class probabilities for every test image, then the most likely class per image
efficientnet_v2_test_probabilities = model_efficientnet_v2.predict(x=test_ds, verbose=1)
efficientnet_v2_test_predictions = tf.math.argmax(efficientnet_v2_test_probabilities, axis=1)

<center><div style='color:#ffffff;
           display:inline-block;
           padding: 5px 5px 5px 5px;
           border-radius:5px;
           background-color:#B6EADA;
           font-size:100%;'><a href=#toc style='text-decoration: none; color:#03001C;'>⬆️ Back To Top</a></div></center>

<a id='4'></a>
# 4 | Transfer Learning Model: Vision Transformer
<div style='border-width: 2px;
              border-bottom-width:4px;
              border-bottom-color:#B6EADA;
              border-bottom-style: solid;'></div>
<br>              

<center>
    <figure>
        <img src="https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F9c096bed-e908-49da-a716-f8a78de952a3_1358x892.png" alt ="Vision Transformer" style='width:60%;'>
        <figcaption>
            Image Source: <a href="https://cameronrwolfe.substack.com/p/vision-transformers">[Source]</a></figcaption>
    </figure>
</center>

The Vision Transformer (ViT) model was introduced in a conference research paper titled "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale," published at The International Conference on Learning Representations (ICLR) in 2021. ViTs are used in image recognition tasks such as object detection, image segmentation, action recognition and image classification.


In comparison to traditional Convolutional Neural Network (CNN) architectures, Vision Transformers achieve remarkable results while requiring fewer computational resources for pre-training. However, Vision Transformers exhibit a weaker inductive bias, which leads to an increased reliance on regularization techniques and data augmentation for smaller datasets. For this reason, CNNs generally perform better on smaller datasets while Vision Transformers excel on large datasets.

So for this section we will use the **pre-trained ViT-B32** model.

**For more information see the following:**
> - Vision Transformer Paper: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale (2021)](https://arxiv.org/pdf/2010.11929.pdf)
> - Attention Mechanism Paper: [Attention Is All You Need (2017)](https://arxiv.org/pdf/1706.03762.pdf)
> - V7Labs Article: [Vision Transformer: What It Is & How It Works [2023 Guide]](https://www.v7labs.com/blog/vision-transformer-guide)
> - Viso.ai Article: [Vision Transformers (ViT) in Image Recognition – 2022 Guide](https://viso.ai/deep-learning/vision-transformer-vit/)

In [None]:
!pip install -q vit-keras

<a id='4.1'></a>
### Get Vision Transformer Model

In [None]:
from vit_keras import vit

# Fetch the pre-trained ViT-B/32 backbone without its classification head.
# NOTE(review): the input pipelines feed images scaled to [0, 1]; confirm that
# this matches the preprocessing the pre-trained vit-keras weights expect.
vit_model = vit.vit_b32(
    image_size=224,
    classes=25,
    activation='softmax',
    include_top=False,
    pretrained=True,
    pretrained_top=False,
)

# Freeze every layer of the backbone so only the new head is trained
for backbone_layer in vit_model.layers:
    backbone_layer.trainable = False

<a id='4.2'></a>
### Define Vision Transformer Model

In [None]:
def vit_b32_model():
    """Build the ViT-B/32 classifier.

    The model stacks the notebook-level `vit_model` backbone on an image input
    and adds dropout, two ReLU dense layers (512 and 256 units) and a 25-way
    softmax output. All dense kernels share one seeded Glorot-normal initializer.
    """
    dense_init = tf.keras.initializers.GlorotNormal(seed=CFG.SEED)

    stem = [
        layers.Input(shape=CFG.IMAGE_SIZE, dtype=tf.float32, name='input_image'),
        vit_model,
    ]
    head = [
        layers.Dropout(0.2),
        layers.Dense(512, activation='relu', kernel_initializer=dense_init),
        layers.Dense(256, activation='relu', kernel_initializer=dense_init),
        layers.Dense(25, dtype=tf.float32, activation='softmax', kernel_initializer=dense_init),
    ]

    return Sequential(stem + head, name='vit_b32_sequential_model')

In [None]:
# Build the ViT-B/32 classifier ...
model_vit_b32 = vit_b32_model()

# ... and print its layer-by-layer summary
model_vit_b32.summary()

In [None]:
# Draw the architecture diagram, including tensor shapes
plot_model(model_vit_b32, show_shapes=True, dpi=60)

<a id='4.3'></a>
### Train Vision Transformer Model

In [None]:
tf.random.set_seed(CFG.SEED)

# Compile the model
model_vit_b32.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=METRICS
)

# Train the model.
# The training pipeline is built from `train_new_df` (the training part of the
# train/validation split), so that is the sample count to report, not the size
# of the full `train_df`.
print(f'Training {model_vit_b32.name}.')
print(f'Train on {len(train_new_df)} samples, validate on {len(val_df)} samples.')
print('----------------------------------')

vit_b32_history = train_model(
    model_vit_b32, CFG.EPOCHS, CALLBACKS, 
    train_ds, val_ds,
    shuffling=False
)

In [None]:
# Score the trained ViT-B/32 classifier on the test pipeline
vit_b32_evaluation = model_vit_b32.evaluate(x=test_ds)

In [None]:
# Class probabilities for every test image, then the most likely class per image
vit_b32_test_probabilities = model_vit_b32.predict(x=test_ds, verbose=1)
vit_b32_test_predictions = tf.math.argmax(vit_b32_test_probabilities, axis=1)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# One-hot ground truth gathered from the test pipeline, and the model's class probabilities
y_true = np.concatenate([batch_labels.numpy() for _, batch_labels in test_ds], axis=0)
y_pred_probs = model_vit_b32.predict(test_ds)

# Collapse both to integer class indices
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_true.argmax(axis=1)

# Confusion matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True, fmt="d", cmap="Blues",
    xticklabels=class_names, yticklabels=class_names,
)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Vit_b32 confusion Matrix")
plt.show()

# Per-class precision / recall / F1
print("Vit_b32 Classification Report:\n", classification_report(y_true, y_pred, target_names=class_names))


In [None]:
# TF Hub handle and layer name of the EfficientNetV2-B3 feature extractor
efficientnet_v2_url_b3 = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b3/feature_vector/2'
model_name_b3 = 'efficientnet_v2_b3'

# The pre-trained weights stay frozen
set_trainable = False

efficientnet_v2_b3 = get_tfhub_model(
    efficientnet_v2_url_b3, model_name_b3,
    model_trainable=set_trainable,
)

def efficientnet_v2_model_b3():
    """Build the EfficientNetV2-B3 classifier.

    The model stacks the notebook-level `efficientnet_v2_b3` layer on an image
    input and adds dropout, two ReLU dense layers (512 and 256 units) and a
    25-way softmax output. All dense kernels share one seeded Glorot-normal
    initializer.
    """
    dense_init = tf.keras.initializers.GlorotNormal(seed=CFG.SEED)

    stem = [
        layers.Input(shape=CFG.IMAGE_SIZE, dtype=tf.float32, name='input_image'),
        efficientnet_v2_b3,
    ]
    head = [
        layers.Dropout(0.2),
        layers.Dense(512, activation='relu', kernel_initializer=dense_init),
        layers.Dense(256, activation='relu', kernel_initializer=dense_init),
        layers.Dense(25, dtype=tf.float32, activation='softmax', kernel_initializer=dense_init),
    ]

    return Sequential(stem + head, name='efficientnet_v2_b3_sequential_model')

# Generate Model
model_efficientnet_v2_b3 = efficientnet_v2_model_b3()

# Seed TensorFlow's global RNG before training, as the EfficientNetV2-B0 and
# ViT training cells do, so this run is reproducible in the same way
tf.random.set_seed(CFG.SEED)

# Compile the model
model_efficientnet_v2_b3.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=METRICS
)

# Train the model
print(f'Training {model_efficientnet_v2_b3.name}.')
efficientnet_v2_history_b3 = train_model(
    model_efficientnet_v2_b3, CFG.EPOCHS, CALLBACKS, 
    train_ds, val_ds,
    shuffling=False
)


In [None]:
# Score the trained EfficientNetV2-B3 classifier on the test pipeline
model_efficientnet_v2_b3_evaluation = model_efficientnet_v2_b3.evaluate(x=test_ds)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# One-hot ground truth gathered from the test pipeline, and the model's class probabilities
y_true = np.concatenate([batch_labels.numpy() for _, batch_labels in test_ds], axis=0)
y_pred_probs = model_efficientnet_v2_b3.predict(test_ds)

# Collapse both to integer class indices
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_true.argmax(axis=1)

# Confusion matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True, fmt="d", cmap="Blues",
    xticklabels=class_names, yticklabels=class_names,
)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Efficientnet_v2_b3 Confusion Matrix")
plt.show()

# Per-class precision / recall / F1
print("Efficientnet_v2_b3 Classification Report:\n", classification_report(y_true, y_pred, target_names=class_names))


In [None]:
# Import necessary libraries
import tensorflow as tf
import tensorflow_hub as hub  # Correct import for KerasLayer
from tensorflow.keras import Sequential, layers
from tensorflow.keras.utils import plot_model

# TF Hub handle and layer name of the ResNet50 feature extractor
resnet50_url = 'https://tfhub.dev/tensorflow/resnet_50/feature_vector/1'
model_name_resnet50 = 'resnet_50'

# Set trainable to False so the pre-trained weights stay frozen
set_trainable = False

# Load ResNet50 with the `get_tfhub_model` helper defined in the TensorFlow Hub
# section. This cell used to redefine `get_tfhub_model` with different
# parameter names and a different return type (a nested Sequential), silently
# shadowing that helper for any cell run afterwards. The hub layer can be
# stacked directly after an Input layer, exactly as the EfficientNet models do.
resnet50_model = get_tfhub_model(resnet50_url,
                                 model_name_resnet50,
                                 model_trainable=set_trainable)

# ResNet50-based classifier
def resnet_model():
    """Build the ResNet50 classifier.

    The model stacks the notebook-level `resnet50_model` on an image input and
    adds dropout (rate 0.3), two ReLU dense layers (512 and 256 units) and a
    25-way softmax output. All dense kernels share one seeded Glorot-normal
    initializer.
    """
    dense_init = tf.keras.initializers.GlorotNormal(seed=CFG.SEED)

    stem = [
        layers.Input(shape=CFG.IMAGE_SIZE, dtype=tf.float32, name='input_image'),
        resnet50_model,
    ]
    head = [
        layers.Dropout(0.3),
        layers.Dense(512, activation='relu', kernel_initializer=dense_init),
        layers.Dense(256, activation='relu', kernel_initializer=dense_init),
        layers.Dense(25, activation='softmax', kernel_initializer=dense_init),
    ]

    return Sequential(stem + head, name='resnet_sequential_model')

# Build the ResNet-50 classifier
model_resnet = resnet_model()

# Print the layer-by-layer summary
model_resnet.summary()

# Visualize Model Structure
# NOTE(review): the value returned by plot_model is discarded because this is
# not the last expression of the cell, so the diagram is presumably not
# rendered inline here — confirm whether that is intended.
plot_model(
    model_resnet, dpi=60,
    show_shapes=True
)

# Compile with categorical cross-entropy, Adam (learning rate 0.001) and the
# METRICS list defined outside this cell
model_resnet.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=METRICS
)

# Train through the notebook's train_model helper (defined outside this cell),
# passing CFG.EPOCHS, the shared CALLBACKS and the train/validation datasets;
# the returned value is kept in resnet_history.
print(f'Training {model_resnet.name}.')
resnet_history = train_model(
    model_resnet, CFG.EPOCHS, CALLBACKS, 
    train_ds, val_ds,
    shuffling=False
)


In [None]:
# Run Keras `evaluate` for the ResNet-50 model over test_ds and keep the
# value it returns (loss and compiled metric values).
model_resnet_evaluation = model_resnet.evaluate(test_ds)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Ground-truth class indices: stack the one-hot label batches yielded by
# test_ds, then reduce every row to the index of its largest entry.
# NOTE(review): labels and predictions are collected in two separate passes
# over test_ds, so this assumes the dataset yields batches in the same order
# on every iteration (no per-iteration reshuffle) — confirm against the pipeline.
y_true = np.argmax(
    np.concatenate([labels.numpy() for _, labels in test_ds], axis=0),
    axis=1,
)

# Class probabilities from the model, then the most likely class per sample
y_pred_probs = model_resnet.predict(test_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Confusion matrix of true vs. predicted class indices
conf_matrix = confusion_matrix(y_true, y_pred)

# Annotated heatmap of the matrix, ticks labelled with the class names
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Resnet 50 Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Text summary of per-class precision / recall / F1
print(
    "Resnet 50 Classification Report:\n",
    classification_report(y_true, y_pred, target_names=class_names),
)


In [None]:
# Import necessary libraries
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Sequential, layers
from tensorflow.keras.utils import plot_model

# TF Hub URL of the ResNet V2 152 feature-vector module and the layer name it
# will carry inside the Keras model
resnet152_url = 'https://tfhub.dev/google/imagenet/resnet_v2_152/feature_vector/5'
model_name_resnet152 = 'resnet_v2_152'

# Passed as `trainable` to hub.KerasLayer below: False keeps the pretrained
# backbone frozen while the dense head stacked on top of it is trained.
set_trainable = False

# Wrap a TF Hub module in a small Sequential model
# (same definition as the one in the ResNet-50 cell, repeated here)
def get_tfhub_model(url, model_name, model_trainable):
    """Return a Sequential consisting of an image input and a TF Hub layer.

    Args:
        url: TF Hub handle passed to ``hub.KerasLayer``.
        model_name: name given to the hub layer.
        model_trainable: forwarded as the hub layer's ``trainable`` flag.

    Returns:
        A ``Sequential`` whose input shape is ``CFG.IMAGE_SIZE``.
    """
    image_input = layers.InputLayer(input_shape=CFG.IMAGE_SIZE, name='input_image')
    hub_backbone = hub.KerasLayer(url, trainable=model_trainable, name=model_name)
    return Sequential([image_input, hub_backbone])

# ResNet V2 152 feature extractor used by resnet152_model_func()
resnet152_model = get_tfhub_model(
    resnet152_url,
    model_name_resnet152,
    set_trainable,
)

# Classifier built on top of the ResNet V2 152 feature extractor
def resnet152_model_func():
    """Build the ResNet-152 based 25-way classifier (uncompiled).

    The network is: image input -> module-level ``resnet152_model`` ->
    Dropout(0.3) -> Dense(512, relu) -> Dense(256, relu) -> Dense(25, softmax).
    All dense kernels share one GlorotNormal initializer seeded with
    ``CFG.SEED``.

    Returns:
        A ``Sequential`` named 'resnet152_sequential_model'.
    """
    glorot_init = tf.keras.initializers.GlorotNormal(seed=CFG.SEED)

    # Layers are created in the same order as they appear in the network
    stack = [
        layers.Input(shape=CFG.IMAGE_SIZE, dtype=tf.float32, name='input_image'),
        resnet152_model,
        layers.Dropout(0.3),
    ]
    for units in (512, 256):
        stack.append(
            layers.Dense(units, activation='relu', kernel_initializer=glorot_init)
        )
    stack.append(
        layers.Dense(25, activation='softmax', kernel_initializer=glorot_init)
    )

    return Sequential(stack, name='resnet152_sequential_model')

# Build the ResNet-152 classifier
model_resnet152 = resnet152_model_func()

# Print the layer-by-layer summary
model_resnet152.summary()

# Visualize Model Structure
# NOTE(review): the value returned by plot_model is discarded because this is
# not the last expression of the cell, so the diagram is presumably not
# rendered inline here — confirm whether that is intended.
plot_model(
    model_resnet152, dpi=60,
    show_shapes=True
)

# Compile with categorical cross-entropy, Adam (learning rate 0.001) and the
# METRICS list defined outside this cell
model_resnet152.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=METRICS
)

# Train through the notebook's train_model helper (defined outside this cell),
# passing CFG.EPOCHS, the shared CALLBACKS and the train/validation datasets;
# the returned value is kept in resnet152_history.
print(f'Training {model_resnet152.name}.')
resnet152_history = train_model(
    model_resnet152, CFG.EPOCHS, CALLBACKS, 
    train_ds, val_ds,
    shuffling=False
)


In [None]:
# Run Keras `evaluate` for the ResNet-152 model over test_ds and keep the
# value it returns (loss and compiled metric values).
model_resnet152_evaluation = model_resnet152.evaluate(test_ds)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Ground-truth class indices: stack the one-hot label batches yielded by
# test_ds, then reduce every row to the index of its largest entry.
# NOTE(review): labels and predictions are collected in two separate passes
# over test_ds, so this assumes the dataset yields batches in the same order
# on every iteration (no per-iteration reshuffle) — confirm against the pipeline.
y_true = np.argmax(
    np.concatenate([labels.numpy() for _, labels in test_ds], axis=0),
    axis=1,
)

# Class probabilities from the model, then the most likely class per sample
y_pred_probs = model_resnet152.predict(test_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Confusion matrix of true vs. predicted class indices
conf_matrix = confusion_matrix(y_true, y_pred)

# Annotated heatmap of the matrix, ticks labelled with the class names
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix for ResNet-152 Model")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Text summary of per-class precision / recall / F1
print(
    "Classification Report for ResNet-152 Model:\n",
    classification_report(y_true, y_pred, target_names=class_names),
)


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Model, layers, Input
from tensorflow.keras.utils import plot_model

# TF Hub feature-vector modules combined in the ensemble.
# NOTE(review): resnet50_url is rebound here to the ResNet V2 50 module; the
# earlier ResNet-50 cell pointed it at 'resnet_50/feature_vector/1'.
resnet50_url = 'https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/5'
efficientnet_b0_url = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b0/feature_vector/2'
efficientnet_b3_url = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b3/feature_vector/2'

# Build a bare TF Hub layer (no Sequential wrapper) for the functional model
def get_tfhub_model(url, input_shape, trainable=False):
    """Return a ``hub.KerasLayer`` for ``url``.

    NOTE(review): this redefinition shadows the earlier
    ``get_tfhub_model(url, model_name, model_trainable)`` used by the ResNet
    cells, and the second positional argument now means ``input_shape``;
    re-running those earlier cells after this one would pass a model name
    where a shape is expected.

    Args:
        url: TF Hub handle.
        input_shape: forwarded as the layer's ``input_shape`` keyword.
        trainable: forwarded as the layer's ``trainable`` flag (default False).

    Returns:
        The constructed ``hub.KerasLayer``.
    """
    layer_kwargs = {"input_shape": input_shape, "trainable": trainable}
    return hub.KerasLayer(url, **layer_kwargs)

# Input shape shared by every backbone (CFG.IMAGE_SIZE is (224, 224, 3))
input_shape = CFG.IMAGE_SIZE

# One shared input tensor; the same image batch is fed to all three backbones
input_layer = Input(shape=input_shape, name="ensemble_input")

# Load ResNet V2 50 and apply it to the shared input.
# NOTE(review): this rebinds the global resnet50_model, which the earlier
# resnet_model() builder reads at call time.
resnet50_model = get_tfhub_model(resnet50_url, input_shape)
resnet50_output = resnet50_model(input_layer)

# Load EfficientNet V2 B0 and apply it to the shared input
efficientnet_b0_model = get_tfhub_model(efficientnet_b0_url, input_shape)
efficientnet_b0_output = efficientnet_b0_model(input_layer)

# Load EfficientNet V2 B3 and apply it to the shared input
efficientnet_b3_model = get_tfhub_model(efficientnet_b3_url, input_shape)
efficientnet_b3_output = efficientnet_b3_model(input_layer)

# Concatenate the three feature vectors into a single one (feature-level
# fusion: the backbones are combined before classification, not by voting)
concatenated = layers.Concatenate(name="concatenate_features")(
    [resnet50_output, efficientnet_b0_output, efficientnet_b3_output]
)

# Classification head: dropout, two ReLU dense layers, 25-way softmax
x = layers.Dropout(0.3)(concatenated)
x = layers.Dense(512, activation='relu', kernel_initializer='he_normal')(x)
x = layers.Dense(256, activation='relu', kernel_initializer='he_normal')(x)
output_layer = layers.Dense(25, activation='softmax', name="final_output")(x)

# Create the ensemble model from the shared input to the softmax output
ensemble_model = Model(inputs=input_layer, outputs=output_layer, name="ResNet_EfficientNet_Ensemble")

# Compile the model.
# NOTE(review): this model tracks plain 'accuracy' whereas the single-backbone
# models above are compiled with METRICS; plot_training_curves further down
# reads the 'accuracy' / 'val_accuracy' history keys.
ensemble_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

# Model Summary
ensemble_model.summary()

# Visualize the model structure
# NOTE(review): the return value is discarded (not the last expression of the
# cell), so the diagram is presumably not rendered inline here.
plot_model(ensemble_model, show_shapes=True, dpi=60)

# Train through the notebook's train_model helper (defined outside this cell);
# the returned value is kept in ensemble_history.
ensemble_history = train_model(
    ensemble_model, CFG.EPOCHS, CALLBACKS, train_ds, val_ds, shuffling=False
)


In [None]:
# Run Keras `evaluate` for the ensemble model over test_ds and keep the value
# it returns (loss and compiled metric values).
ensemble_model_evaluation = ensemble_model.evaluate(test_ds)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Ground-truth class indices: stack the one-hot label batches yielded by
# test_ds, then reduce every row to the index of its largest entry.
# NOTE(review): labels and predictions are collected in two separate passes
# over test_ds, so this assumes the dataset yields batches in the same order
# on every iteration (no per-iteration reshuffle) — confirm against the pipeline.
y_true = np.argmax(
    np.concatenate([labels.numpy() for _, labels in test_ds], axis=0),
    axis=1,
)

# Class probabilities from the model, then the most likely class per sample
y_pred_probs = ensemble_model.predict(test_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

# Confusion matrix of true vs. predicted class indices
conf_matrix = confusion_matrix(y_true, y_pred)

# Annotated heatmap of the matrix, ticks labelled with the class names
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix - resnet50 + efficientnet_b0 + b3")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Text summary of per-class precision / recall / F1
print(
    "Classification Report resnet50 + efficientnet_b0 + b3:\n",
    classification_report(y_true, y_pred, target_names=class_names),
)


In [None]:
from tensorflow.keras.utils import plot_model

# Draw the ensemble architecture (with shapes and layer names) and write it to
# ensemble_model.png; being the last expression of the cell, the value returned
# by plot_model is what the notebook displays.
plot_model(ensemble_model, show_shapes=True, show_layer_names=True, dpi=60, to_file="ensemble_model.png")

In [None]:
import matplotlib.pyplot as plt

def plot_training_curves(history):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: object exposing a ``history`` dict with the per-epoch lists
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' (the layout of a
            Keras ``History`` for a model compiled with the 'accuracy' metric).

    Raises:
        KeyError: if one of those four keys is missing from ``history.history``.
    """
    logs = history.history
    acc, val_acc = logs['accuracy'], logs['val_accuracy']
    loss, val_loss = logs['loss'], logs['val_loss']

    # Number epochs from 1 so the x axis matches the "Epoch k/N" training log
    epochs_range = range(1, len(acc) + 1)

    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

    # Accuracy panel
    ax_acc.plot(epochs_range, acc, label='Training Accuracy')
    ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
    ax_acc.set_xlabel('Epoch')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.set_title('Training and Validation Accuracy')
    ax_acc.legend(loc='lower right')

    # Loss panel
    ax_loss.plot(epochs_range, loss, label='Training Loss')
    ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_title('Training and Validation Loss')
    ax_loss.legend(loc='upper right')

    plt.show()

# Plot the curves for the ensemble model; ensemble_history is the value
# returned by train_model in the ensemble training cell.
plot_training_curves(ensemble_history)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(y_true, y_pred, class_names):
    """Plot a confusion-matrix heatmap with one row/column per class name.

    Args:
        y_true: 1-D array-like of integer class indices (ground truth).
        y_pred: 1-D array-like of integer class indices (predictions).
        class_names: sequence of display names; entry ``i`` labels class ``i``.
    """
    # Pin the label set to 0..len(class_names)-1. Without `labels`, sklearn
    # builds the matrix only from the classes that occur in y_true or y_pred,
    # so a class missing from both would shrink the matrix and shift every
    # tick label after it onto the wrong row/column.
    label_ids = np.arange(len(class_names))
    conf_matrix = confusion_matrix(y_true, y_pred, labels=label_ids)

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.show()

# Usage: y_true / y_pred are the globals left behind by the most recently run
# evaluation cell (the ensemble one when the notebook is executed top to bottom).
plot_confusion_matrix(y_true, y_pred, class_names)


In [None]:
import pandas as pd

def compute_per_class_accuracy(y_true, y_pred, n_classes):
    """Return the fraction of correctly predicted samples for each class.

    Args:
        y_true: 1-D array-like of integer class indices (ground truth).
        y_pred: 1-D array-like of integer class indices, same length as y_true.
        n_classes: number of classes; the result always has this length.

    Returns:
        Float array of length ``n_classes``. Classes with no samples in
        ``y_true`` are reported as NaN rather than triggering a 0/0 division.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # minlength keeps one slot per class even when the highest-numbered
    # classes never occur in y_true
    total_per_class = np.bincount(y_true, minlength=n_classes)
    hits = (y_true == y_pred).astype(float)
    correct_per_class = np.bincount(y_true, weights=hits, minlength=n_classes)

    # Divide only where the class has samples; untouched slots stay NaN
    acc_per_class = np.full(n_classes, np.nan)
    np.divide(correct_per_class, total_per_class,
              out=acc_per_class, where=total_per_class > 0)
    return acc_per_class


def plot_per_class_accuracy(y_true, y_pred, class_names):
    """Draw a bar chart of per-class accuracy.

    Args:
        y_true: 1-D array-like of integer class indices (ground truth).
        y_pred: 1-D array-like of integer class indices (predictions).
        class_names: sequence of display names; entry ``i`` labels class ``i``.
    """
    acc_per_class = compute_per_class_accuracy(y_true, y_pred, len(class_names))

    plt.figure(figsize=(12, 6))
    sns.barplot(x=class_names, y=acc_per_class)
    plt.xticks(rotation=45, ha='right')
    plt.ylabel("Accuracy")
    plt.title("Per-Class Accuracy")
    plt.ylim(0, 1.0)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

# y_true / y_pred are the globals left behind by the most recently run
# evaluation cell (the ensemble one when the notebook is executed top to bottom).
plot_per_class_accuracy(y_true, y_pred, class_names)


In [None]:
import matplotlib.pyplot as plt

def show_sample_predictions(model, dataset, class_names, n=9):
    """Show up to ``n`` images from the first batch with true/predicted labels.

    Titles are green when the prediction matches the label and red otherwise.
    The subplot grid is sized from the number of images actually shown, so
    ``n`` may exceed 9, and a batch smaller than ``n`` is handled by showing
    only the images available.

    Args:
        model: object with a ``predict`` method returning per-class scores.
        dataset: dataset yielding ``(images, one_hot_labels)`` batches; only
            the first batch is used.
        class_names: sequence of display names; entry ``i`` labels class ``i``.
        n: maximum number of images to display (default 9).
    """
    for images, labels in dataset.take(1):
        # Never index past the end of the batch
        n_shown = min(n, int(images.shape[0]))
        if n_shown <= 0:
            return

        pred_classes = np.argmax(model.predict(images), axis=1)
        true_classes = np.argmax(labels, axis=1)

        # Near-square grid; this is 3x3 with a 12x12 figure for the default n=9
        n_cols = int(np.ceil(np.sqrt(n_shown)))
        n_rows = int(np.ceil(n_shown / n_cols))
        plt.figure(figsize=(4 * n_cols, 4 * n_rows))

        for i in range(n_shown):
            plt.subplot(n_rows, n_cols, i + 1)
            # NOTE(review): the uint8 cast assumes pixel values in 0-255; if the
            # input pipeline rescales images to [0, 1] they would render black —
            # confirm against the dataset loader.
            plt.imshow(images[i].numpy().astype("uint8"))
            plt.axis("off")
            pred_label = class_names[pred_classes[i]]
            true_label = class_names[true_classes[i]]
            color = "green" if pred_label == true_label else "red"
            plt.title(f"True: {true_label}\nPred: {pred_label}", color=color)

        plt.tight_layout()
        plt.show()

# Show sample predictions of the ensemble model on the first batch of test_ds
show_sample_predictions(ensemble_model, test_ds, class_names)


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle
import matplotlib.pyplot as plt
import numpy as np

def plot_multiclass_roc(y_true, y_score, class_names):
    """Plot one-vs-rest ROC curves, one per class, with their AUC in the legend.

    Args:
        y_true: 1-D array-like of integer class indices.
        y_score: 2-D array of per-class scores, indexed as ``y_score[:, i]``
            for class ``i``.
        class_names: sequence of display names; entry ``i`` labels class ``i``.

    NOTE(review): ``label_binarize`` returns a single column when there are
    only two classes, so the per-class column indexing below presumably
    requires ``len(class_names) > 2`` — confirm before reusing on a binary task.
    """
    n_classes = len(class_names)
    # One indicator column per class
    y_true_bin = label_binarize(y_true, classes=range(n_classes))

    # Curve points and area for every class, computed before any drawing
    curves = []
    for idx in range(n_classes):
        fpr_i, tpr_i, _ = roc_curve(y_true_bin[:, idx], y_score[:, idx])
        curves.append((fpr_i, tpr_i, auc(fpr_i, tpr_i)))

    plt.figure(figsize=(12, 10))
    palette = cycle(plt.cm.tab20.colors)

    # One line per class; colours repeat once the palette is exhausted
    for name, (fpr_i, tpr_i, auc_i), color in zip(class_names, curves, palette):
        plt.plot(fpr_i, tpr_i, color=color, lw=2,
                 label=f'Class {name} (AUC = {auc_i:.2f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-class ROC Curve')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

# Usage: y_true / y_pred_probs are the globals left behind by the most recently
# run evaluation cell (the ensemble one when the notebook is executed top to bottom).
plot_multiclass_roc(y_true, y_pred_probs, class_names)
