# Assignment 3



*   Akshay Kumar
*   CS23MTECH11022




## Question 1

1. Self-Attention for Object Recognition with CNNs: Implement a sample CNN with one or
more self-attention layer(s) for performing object recognition over CIFAR-10 dataset. You have to
implement the self-attention layer yourself and use it in the forward function defined by you. All
other layers (fully connected, nonlinearity, conv layer, etc.) can be bulit-in implementations. The
network can be a simpler one (e.g., it may have 1x Conv, 4x [Conv followed by SA], 1x Conv, and
1x GAP). Please refer to the reading material provided here or any other similar one.

In [1]:
# Importing required libraries to load the CIFAR-10 dataset.
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Convert it into a PyTorch tensor, and normalize it for each channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Loading CIFAR-10 Dataset.
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# PyTorch utility that helps with efficient data loading.
training_data = DataLoader(train_dataset, batch_size=64, shuffle=True)
testing_data = DataLoader(test_dataset, batch_size=64, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13168225.55it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Defining the Self-Attention Layer
class SelfAttentionLayer(nn.Module):
    def __init__(self, input_dimension):
        super(SelfAttentionLayer, self).__init__()
        # Query, Key, and Value convolutions for self-attention mechanism.
        self.query_convolution = nn.Conv2d(in_channels=input_dimension, out_channels=input_dimension//8, kernel_size=1)
        self.key_convolution = nn.Conv2d(in_channels=input_dimension, out_channels=input_dimension//8, kernel_size=1)
        self.value_convolution = nn.Conv2d(in_channels=input_dimension, out_channels=input_dimension, kernel_size=1)

        # Initializing learnable parameter for scaling the attention output.
        self.gamma_parameter = nn.Parameter(torch.zeros(1))

    # Forward pass.
    def forward(self, x):
        batch_size, channels, height, width = x.size()

        # Query, Key, and Value projections.
        projected_query = self.query_convolution(x).view(batch_size, -1, width*height).permute(0, 2, 1)
        projected_key = self.key_convolution(x).view(batch_size, -1, width*height)
        energy = torch.bmm(projected_query, projected_key)
        attention = F.softmax(energy, dim=-1)
        projected_value = self.value_convolution(x).view(batch_size, -1, width*height)

        # Calculating output of self-attention.
        output = torch.bmm(projected_value, attention.permute(0, 2, 1))
        output = output.view(batch_size, channels, height, width)
        output = self.gamma_parameter * output + x
        return output

# Defining CNN Architecture with Self-Attention.
class CNNWithSelfAttention(nn.Module):
    def __init__(self):
        super(CNNWithSelfAttention, self).__init__()

        # Convolutional layers and self-attention modules.
        self.convolution1 = nn.Conv2d(3, 32, 3, padding=1)
        self.sa1 = SelfAttentionLayer(32)
        self.convolution2 = nn.Conv2d(32, 64, 3, padding=1)
        self.sa2 = SelfAttentionLayer(64)
        self.convolution3 = nn.Conv2d(64, 128, 3, padding=1)
        self.sa3 = SelfAttentionLayer(128)
        self.convolution4 = nn.Conv2d(128, 256, 3, padding=1)
        self.sa4 = SelfAttentionLayer(256)
        self.convolution5 = nn.Conv2d(256, 10, 3, padding=1)
        self.global_average_pooling = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):

        # Forward pass through the network
        x = F.relu(self.convolution1(x))
        x = self.sa1(x)
        x = F.relu(self.convolution2(x))
        x = self.sa2(x)
        x = F.relu(self.convolution3(x))
        x = self.sa3(x)
        x = F.relu(self.convolution4(x))
        x = self.sa4(x)
        x = F.relu(self.convolution5(x))
        x = self.global_average_pooling(x)
        x = x.view(-1, 10)
        return F.log_softmax(x, dim=-1)

# Instantiating the model.
model = CNNWithSelfAttention()

# Defining loss function and optimizer.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    # Iterating over the training dataset.
    for input_data, labels_data in training_data:
        input_data, labels_data = input_data.to(device), labels_data.to(device)
        optimizer.zero_grad()
        outputs = model(input_data)
        loss = loss_function(outputs, labels_data)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * input_data.size(0)

    # Calculating loss in each epoch and printing it.
    epoch_loss = running_loss / len(training_data.dataset)
    print(f"Loss: {epoch_loss:.4f}, Epoch [{epoch+1}]")

# Testing loop
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    # Iterating over the testing dataset
    for input_data, labels_data in testing_data:
        input_data, labels_data = input_data.to(device), labels_data.to(device)
        outputs = model(input_data)
        _, predicted_labels = torch.max(outputs, 1)
        total_predictions += labels_data.size(0)
        correct_predictions += (predicted_labels == labels_data).sum().item()

# Calculating accuracy and printing it.
accuracy = correct_predictions / total_predictions
print(f"Accuracy on test dataset of CIFAR 10 dataset: {accuracy:.2%}")

Loss: 2.2039, Epoch [1]
Loss: 1.7772, Epoch [2]
Loss: 1.2344, Epoch [3]
Loss: 1.0336, Epoch [4]
Loss: 0.9048, Epoch [5]
Loss: 0.8108, Epoch [6]
Loss: 0.7366, Epoch [7]
Loss: 0.6769, Epoch [8]
Loss: 0.6208, Epoch [9]
Loss: 0.5793, Epoch [10]
Accuracy on test dataset of CIFAR 10 dataset: 73.25%


In [5]:
# Counting the number of parameters
total_params_cnn_sa = sum(p.numel() for p in model.parameters())
print("Total parameters in CNN with Self-Attention model:", total_params_cnn_sa)

Total parameters in CNN with Self-Attention model: 520870


## Question 2

2. Object Recognition with Vision Transformer: Implement and train an Encoder only Trans-
former (ViT-like) for the above object recognition task. In other words, implement multi-headed self-attention for the image classification (i.e., appending a < class > token to the image patches
that are accepted as input tokens). Compare the performance of the two implementations (try to
keep the number of parameters to be comparable and use the same amount of training and testing
data).

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Multi-Head Self-Attention Module.
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, input_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert input_dim % num_heads == 0, "Input dimension must be divisible by the number of heads."
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.head_dim = input_dim // num_heads

        # Applying linear transformations for queries, keys, and values.
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)

        # Final linear transformation
        self.fc_out = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        # Applying linear transformation and splitting into multiple heads.
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim)

        # Finding ranspose for batch matrix multiplication.
        Q = Q.permute(0, 2, 1, 3)
        K = K.permute(0, 2, 1, 3)
        V = V.permute(0, 2, 1, 3)

        # Calculating attention scores
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.head_dim
        attention = F.softmax(energy, dim=-1)

        # Finding weighted sum of values
        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, seq_len, self.input_dim)

        # Final linear transformation
        x = self.fc_out(x)
        return x

# Defining Transformer Encoder Block
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super(TransformerEncoderBlock, self).__init__()
        self.self_attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, embed_dim)
        )

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x_att = self.dropout(self.self_attention(x))
        # finding layer normalization and feed forward.
        x = self.norm1(x + x_att)
        x_ff = self.dropout(self.feed_forward(x))

        # Residual connection and layer normalization
        x = self.norm2(x + x_ff)
        return x

# Define Vision Transformer
class VisionTransformer(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, embed_dim, num_heads, mlp_dim, num_layers, dropout=0.1):
        super(VisionTransformer, self).__init__()
        assert image_size % patch_size == 0, "Image dimensions must be divisible by the patch size."
        num_patches = (image_size // patch_size) ** 2

        # Patch embedding and positional embedding
        self.patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.positional_embedding = nn.Parameter(torch.randn(1, num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(dropout)

        # Transformer encoder blocks
        self.transformer_encoder_blocks = nn.ModuleList([
            TransformerEncoderBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ])

        # Class token
        self.class_token = nn.Parameter(torch.randn(1, 1, embed_dim))

        # Fully connected layer for classification
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # Extracting the dimensions of the input tensor x.
        B, C, H, W = x.shape

        # Patch embedding: applying a convolution operation to extract image patches
        #and embedding them into a lower-dimensional space.
        x = self.patch_embedding(x).flatten(2).transpose(1, 2)  # (B, embed_dim, num_patches)

        # Adding a class token to the embedded patches.
        class_token = self.class_token.expand(B, -1, -1)
        x = torch.cat([class_token, x], dim=1)

        # Adding positional embeddings to the embedded patches.
        x += self.positional_embedding
        x = self.dropout(x)

        # Iterate through transformer encoder blocks
        for transformer_encoder_block in self.transformer_encoder_blocks:
            x = transformer_encoder_block(x)

        # Extracting class token
        x = x[:, 0]
        x = self.fc(x)
        return F.log_softmax(x, dim=-1)


# Initalizing the ViT model.
image_size = 32
patch_size = 16
num_classes = 10
embed_dim = 128
num_heads = 8
mlp_dim = 256
num_layers = 3
dropout = 0.1

model = VisionTransformer(image_size, patch_size, num_classes, embed_dim, num_heads, mlp_dim, num_layers, dropout)

# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    # Iterating over the training dataset.
    for input_data, labels_data in training_data:
        input_data, labels_data = input_data.to(device), labels_data.to(device)
        optimizer.zero_grad()
        outputs = model(input_data)
        loss = loss_function(outputs, labels_data)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * input_data.size(0)

    # Calculating loss in each epoch and printing it.
    epoch_loss = running_loss / len(training_data.dataset)
    print(f"Loss: {epoch_loss:.4f}, Epoch [{epoch+1}]")


model.eval()
correct_predictions = 0
total_predictions = 0

# Testing loop
with torch.no_grad():
    # Iterating over the testing dataset
    for input_data, labels_data in testing_data:
        input_data, labels_data = input_data.to(device), labels_data.to(device)
        outputs = model(input_data)
        _, predicted_labels = torch.max(outputs, 1)
        total_predictions += labels_data.size(0)
        correct_predictions += (predicted_labels == labels_data).sum().item()

# Calculating accuracy and printing it.
accuracy = correct_predictions / total_predictions
print(f"Accuracy on test dataset of CIFAR 10 dataset: {accuracy:.2%}")

Loss: 1.7013, Epoch [1]
Loss: 1.4873, Epoch [2]
Loss: 1.3998, Epoch [3]
Loss: 1.3399, Epoch [4]
Loss: 1.2924, Epoch [5]
Loss: 1.2506, Epoch [6]
Loss: 1.2189, Epoch [7]
Loss: 1.1862, Epoch [8]
Loss: 1.1566, Epoch [9]
Loss: 1.1307, Epoch [10]
Accuracy on test dataset of CIFAR 10 dataset: 55.37%


Accuracy on test dataset of CIFAR 10 in Object Recognition with Vision Transformer is ~55% which is low as compared to Object Recognition with CNNs and Self-Attention where the accuracy on the same test dataset is ~73%.

One of the reason for the same may be because of more number of convolution layers used in the CNN network.

In [3]:
# Count the number of parameters
total_params_vit = sum(p.numel() for p in model.parameters())
print("Total parameters in ViT model:", total_params_vit)

Total parameters in ViT model: 497930


Total number of parameters in the ViT model is 497930 which is  not exactly the same as in CNN with Self-Attention model which is 520870 parameters, but it is nearly comparable. When I tried to increase the number of parameters from this current scenario in ViT model then the accuracy drops suddenly.