<center><img src="picture.jpg" width="600" height="500" /></center>

# ViT (Vision Transformer) With Details

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [None]:
# Number of target classes in CIFAR-100.
num_classes = 100

# Shape of a single input image (height, width, channels).
input_shape = (32, 32, 3)

# Load the CIFAR-100 dataset into training and testing sets.
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()

# Print the shapes of the training and testing data and labels.
print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")

# One-hot encode the training labels.
# The class index of each sample is read from column 0 of 'y_train'; a single
# vectorized assignment then sets y[row, class_index] = 1 for every row,
# replacing a Python-level loop over all samples.
class_indices = y_train[:, 0]
y = np.zeros((class_indices.shape[0], num_classes), dtype='float32')
y[np.arange(class_indices.shape[0]), class_indices] = 1

# Update 'y_train' to hold the one-hot encoded labels.
# NOTE: 'y_test' is left as integer class indices, exactly as loaded.
y_train = y


In [None]:
class _PatchEncoder(layers.Layer):
    """Project flattened patches and add a learned positional embedding.

    The projection and the embedding live inside a layer so that their
    weights are tracked by any model that uses it. Calling
    ``layers.Embedding`` directly on a constant ``tf.range`` tensor while
    building a functional model only produces a constant tensor; the
    embedding weights are then not part of the model and are never trained.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patches):
        # One position index per patch; the embedding lookup result is
        # broadcast over the batch dimension when added to the projection.
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        return self.projection(patches) + self.position_embedding(positions)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config


def create_vit_classifier(
    input_shape,
    patch_size=4,
    projection_dim=64,
    num_heads=2,
    dropout_rate=0.1,
    num_transformer_layers=1,
    num_classes=100,
):
    """Build a Vision Transformer (ViT) image classifier as a Keras model.

    Args:
        input_shape: Image shape as (height, width, channels).
        patch_size: Side length, in pixels, of each square patch.
        projection_dim: Width of the patch embedding and of every
            transformer block output.
        num_heads: Number of attention heads per transformer block.
        dropout_rate: Dropout rate used in attention, the transformer MLP
            and the classification head MLP.
        num_transformer_layers: Number of transformer blocks to stack. The
            default of 1 matches the single block that was implemented
            before this parameter existed.
        num_classes: Number of output classes.

    Returns:
        A ``keras.Model`` mapping a batch of images to per-class softmax
        probabilities of shape (batch, num_classes).
    """
    inputs = layers.Input(shape=input_shape)

    # ------------------------------------------------------------------
    # Patch extraction
    # ------------------------------------------------------------------
    # Non-overlapping patches: stride equals patch size. With "VALID"
    # padding any partial patch at the border is dropped, hence the floor
    # division when counting patches.
    patches = tf.image.extract_patches(
        images=inputs,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding="VALID",
    )
    num_patches = (input_shape[0] // patch_size) * (input_shape[1] // patch_size)
    patch_dims = patches.shape[-1]  # Flattened size of one patch.

    # Collapse the patch grid into a sequence: (batch, num_patches, patch_dims).
    patches = layers.Reshape((num_patches, patch_dims))(patches)

    # ------------------------------------------------------------------
    # Patch encoding (linear projection + learned positional embedding)
    # ------------------------------------------------------------------
    encoded_patches = _PatchEncoder(num_patches, projection_dim)(patches)

    # ------------------------------------------------------------------
    # Transformer blocks (pre-norm attention + MLP, each with a skip)
    # ------------------------------------------------------------------
    for _ in range(num_transformer_layers):
        # Layer normalization before self-attention.
        normed = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)

        # Multi-head self-attention (query and value are the same tensor).
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=dropout_rate
        )(normed, normed)

        # Skip connection 1.
        attended = layers.Add()([attention_output, encoded_patches])

        # Layer normalization before the MLP.
        mlp = layers.LayerNormalization(epsilon=1e-6)(attended)

        # MLP: expand to twice the width, then project back so the result
        # can be added to the skip branch.
        mlp = layers.Dense(projection_dim * 2, activation=tf.nn.gelu)(mlp)
        mlp = layers.Dropout(dropout_rate)(mlp)
        mlp = layers.Dense(projection_dim, activation=tf.nn.gelu)(mlp)
        mlp = layers.Dropout(dropout_rate)(mlp)

        # Skip connection 2.
        encoded_patches = layers.Add()([mlp, attended])

    # ------------------------------------------------------------------
    # Classification head
    # ------------------------------------------------------------------
    # Flatten all patch embeddings into one vector per image:
    # (batch, num_patches * projection_dim).
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)

    # MLP layers for further feature processing.
    features = layers.Dense(2048, activation=tf.nn.gelu)(representation)
    features = layers.Dropout(dropout_rate)(features)
    features = layers.Dense(1024, activation=tf.nn.gelu)(features)
    features = layers.Dropout(dropout_rate)(features)

    # Softmax output: these are class probabilities, not raw logits.
    probabilities = layers.Dense(num_classes, activation='softmax')(features)

    return keras.Model(inputs=inputs, outputs=probabilities)


model = create_vit_classifier(input_shape)

In [None]:
# Images fed to the model are 32x32 pixels with 3 colour channels.
input_shape = (32, 32, 3)

# Build a fresh ViT classifier for that image shape.
model = create_vit_classifier(input_shape)

# Show the layer-by-layer architecture and parameter counts.
model.summary()


In [None]:
# Training hyperparameters.
batch_size = 64
num_epochs = 10
learning_rate = 0.0005

# Adam optimizer with the specified learning rate.
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Compile the model.
# The model ends in a softmax over mutually exclusive classes and the training
# labels are one-hot encoded, so the matching loss is categorical
# cross-entropy. Binary cross-entropy would score every class as an
# independent yes/no decision and make 'accuracy' resolve to binary accuracy,
# which is dominated by the many correctly predicted zeros.
model.compile(
    optimizer=optimizer,
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Train the model on the training data.
history = model.fit(
    x=x_train,                  # Training data features (input images)
    y=y_train,                  # Training data labels (one-hot encoded)
    batch_size=batch_size,      # Number of samples per batch
    epochs=num_epochs,          # Number of training epochs
    validation_split=0.1,       # Fraction of training data to use for validation
)
