# Vision Transformer

In [1]:
import torch
import torch.nn as nn

## Patch Embeddings

In [4]:
class PatchEmbed(nn.Module):
    """Split image into patches and then embed them.

    Parameters
    ----------
    img_size : int
        Size of the image (square image).

    patch_size : int
        Size of the patch (square).

    in_chans : int
        Number of input channels.

    embed_dim : int
        The embedding dimension.
        This remains constant across the network.

    Attributes
    ----------
    img_size : int
        The image size passed to the constructor.

    patch_size : int
        The patch size passed to the constructor.

    n_patches : int
        Number of patches we split our image into.

    proj : nn.Conv2d
        Convolutional layer to split the image into patches.
        It does two things: splitting as well as the embedding.

    """

    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        # The image size is expected to be perfectly divisible by the patch
        # size so that the patches completely cover the image.
        self.n_patches = (img_size // patch_size) ** 2

        # Both kernel_size and stride are set to patch_size. This way, when
        # the kernel slides along the input tensor it never overlaps itself:
        # it falls exactly onto the patches we want to divide the image into.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            shape -> '(n_samples, in_chans, img_size, img_size)'

        Returns
        -------
        torch.Tensor
            shape -> '(n_samples, n_patches, embed_dim)'

        """
        # The input tensor is a batch of images; n_samples and batch size are
        # synonymous and n_samples is used across the entire code.
        # Following the PyTorch convention, channels are dimension 1.
        # The output is a three-dimensional tensor: the second dimension
        # indexes the patches and the last one is the embedding dimension.

        # Run the input through the conv layer to get a 4D tensor.
        x = self.proj(x)  # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)

        # Flatten the last two dimensions (the grid of patches) into one.
        x = x.flatten(2)  # (n_samples, embed_dim, n_patches)

        # Finally swap the patch and embedding dimensions.
        x = x.transpose(1, 2)  # (n_samples, n_patches, embed_dim)

        return x

## Attention Module

In [19]:
class Attention(nn.Module):
    """Attention mechanism.

    Parameters
    ----------
    dim : int
        The input and output dimension of per token features.
        Must be divisible by `n_heads`.

    n_heads : int
        Number of attention heads.

    qkv_bias : bool
        If True then we include bias to the query, key and value projections.

    attn_p : float
        Dropout probability applied to the attention weights
        (the output of the softmax).

    proj_p : float
        Dropout probability applied to the output tensor.

    Attributes
    ----------
    head_dim : int
        Dimensionality of each attention head (`dim // n_heads`).

    scale : float
        Normalizing constant for the dot product.

    qkv : nn.Linear
        Linear projection for the query, key and value.

    proj : nn.Linear
        Linear mapping that takes the concatenated output of all attention
        heads and maps it into a new space.

    attn_drop, proj_drop : nn.Dropout
        Dropout layers.
    """
    # The input dimension of the tokens equals the output dimension.
    # n_heads is a hyperparameter of the attention mechanism.
    # qkv_bias determines whether the query, key and value projection has a bias.
    # In inference mode the dropout layers are not needed, but dropout helps
    # against overfitting during training.

    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        # Fail early: with a non-divisible dim the reshape in `forward` could
        # never succeed.
        if dim % n_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by n_heads ({n_heads})"
            )

        self.n_heads = n_heads
        self.dim = dim
        # Per-head dimensionality. Chosen so that concatenating all heads
        # gives a tensor with the same dimensionality as the input.
        self.head_dim = dim // n_heads

        # Scale from the "Attention Is All You Need" paper: avoids feeding
        # extremely large values into the softmax, which could lead to small
        # gradients.
        self.scale = self.head_dim ** -0.5

        # One linear mapping that takes a token embedding and generates the
        # query, key and value at once (equivalent to three separate ones).
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_p)
        # Linear mapping applied to the concatenated heads.
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            shape -> '(n_samples, n_patches + 1, dim)'.

        Returns
        -------
        torch.Tensor
            shape -> '(n_samples, n_patches + 1, dim)'

        Raises
        ------
        ValueError
            If the last dimension of `x` differs from `dim` given to the
            constructor.
        """
        # Input and output tensors have the same shape. The second dimension
        # has size n_patches + 1 because the class token is always the first
        # token in the sequence.
        n_samples, n_tokens, dim = x.shape

        # Sanity check: the embedding dimension of the input must match the
        # one declared in the constructor.
        if dim != self.dim:
            raise ValueError(
                f"Expected input with last dimension {self.dim}, got {dim}"
            )

        # Turn the input tensor into queries, keys and values.
        qkv = self.qkv(x)  # (n_samples, n_patches+1, 3*dim)
        # The reshape creates an extra dimension for q/k/v and one for the heads.
        qkv = qkv.reshape(
            n_samples, n_tokens, 3, self.n_heads, self.head_dim
        )  # (n_samples, n_patches+1, 3, n_heads, head_dim)
        # The permute only changes the order of the dimensions.
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, n_samples, n_heads, n_patches+1, head_dim)

        q, k, v = qkv[0], qkv[1], qkv[2]

        # Transpose the keys to prepare for the dot product.
        k_t = k.transpose(-2, -1)  # (n_samples, n_heads, head_dim, n_patches+1)

        # The matrix product works because the last two dimensions of the two
        # tensors are compatible.
        dp = (q @ k_t) * self.scale  # (n_samples, n_heads, n_patches+1, n_patches+1)

        # Softmax over the last dimension yields a discrete probability
        # distribution (sums to 1) used as weights of a weighted average.
        attn = dp.softmax(dim=-1)  # (n_samples, n_heads, n_patches+1, n_patches+1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v  # (n_samples, n_heads, n_patches+1, head_dim)
        # The next two operations concatenate the attention heads, giving a
        # 3-dimensional tensor with exactly the shape we want.
        weighted_avg = weighted_avg.transpose(1, 2)  # (n_samples, n_patches+1, n_heads, head_dim)
        weighted_avg = weighted_avg.flatten(2)  # (n_samples, n_patches+1, dim)

        x = self.proj(weighted_avg)  # (n_samples, n_patches+1, dim)
        x = self.proj_drop(x)  # (n_samples, n_patches+1, dim)

        return x
        
        
        

In [22]:
class MLP(nn.Module):
    """Multilayer perceptron.

    Parameters
    ----------
    in_features : int
        Number of input features.

    hidden_features : int
        Number of nodes in the hidden layer.

    out_features : int
        Number of output features.

    p : float
        Dropout probability.

    Attributes
    ----------
    fc1 : nn.Linear
        The first linear layer.

    act : nn.GELU
        GELU activation function.

    fc2 : nn.Linear
        The second linear layer.

    drop : nn.Dropout
        Dropout layer.
    """

    # The MLP has one hidden layer and uses the Gaussian Error Linear Unit
    # (GELU) activation function.

    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        # The activation must be a module *instance*, not the class itself.
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            shape -> '(n_samples, n_patches + 1, in_features)'.

        Returns
        -------
        torch.Tensor
            shape -> '(n_samples, n_patches + 1, out_features)'
        """
        # Similar to the attention block, the linear mappings are applied to
        # a 3-dimensional tensor (they act on the last dimension).
        x = self.fc1(x)  # (n_samples, n_patches+1, hidden_features)
        x = self.act(x)  # (n_samples, n_patches+1, hidden_features)
        x = self.drop(x)  # (n_samples, n_patches+1, hidden_features)
        x = self.fc2(x)  # (n_samples, n_patches+1, out_features)
        x = self.drop(x)  # (n_samples, n_patches+1, out_features)

        return x
        

## putting things together

In [47]:
class Block(nn.Module):
    """Transformer block.

    Parameters
    ----------
    dim : int
        Embedding dimension.

    n_heads : int
        Number of attention heads.

    mlp_ratio : float
        Determines the hidden dimension size of the `MLP` module with respect
        to `dim`.

    qkv_bias : bool
        If True then we include bias to the query, key and value projections.

    p, attn_p : float
        Dropout probability. `attn_p` is passed to `Attention` as `attn_p`;
        `p` is passed to `Attention` as `proj_p` and to `MLP` as `p`.

    Attributes
    ----------
    norm1, norm2 : LayerNorm
        Layer normalization.

    attn : Attention
        Attention module.

    mlp : MLP
        MLP module.
    """

    # Attributes: two normalization layers, one attention module and one MLP
    # module. The layer norms use eps=1e-6 so as to match the pretrained model.

    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
            dim,
            n_heads=n_heads,
            qkv_bias=qkv_bias,
            attn_p=attn_p,
            proj_p=p,
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        # Absolute size of the hidden layer, derived from the ratio.
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
            in_features=dim,
            hidden_features=hidden_features,
            out_features=dim,
            # Forward the dropout probability so that `p` also takes effect
            # inside the MLP (no change for the default p=0.).
            p=p,
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        # First residual connection: add to the input the tensor obtained by
        # applying the first layer norm followed by the attention.
        x = x + self.attn(self.norm1(x))
        # Second residual connection: add the tensor obtained by applying the
        # second layer norm followed by the multi-layer perceptron.
        # The two layer norm modules are separate, each with its own parameters.
        x = x + self.mlp(self.norm2(x))

        return x


In [None]:
class VisionTransformer(nn.Module):
    """Simplified implementation of the Vision transformer.

    Parameters
    ----------
    img_size : int
        Height and width of the (square) input image.

    patch_size : int
        Height and width of each (square) patch.

    in_chans : int
        Number of input channels.

    n_classes : int
        Number of classes.

    embed_dim : int
        Dimensionality of the token/patch embeddings.

    depth : int
        Number of blocks.

    n_heads : int
        Number of attention heads.

    mlp_ratio : float
        Determines the hidden dimension of the `MLP` module.

    qkv_bias : bool
        If True then we include bias to the query, key and value projections.

    p, attn_p : float
        Dropout probability.

    Attributes
    ----------
    patch_embed : PatchEmbed
        Instance of `PatchEmbed` layer.

    cls_token : nn.Parameter
        Learnable parameter that will represent the first token in the
        sequence. It has `embed_dim` elements.

    pos_embed : nn.Parameter
        Positional embedding of the cls token + all the patches.
        It has `(n_patches + 1) * embed_dim` elements.

    pos_drop : nn.Dropout
        Dropout layer.

    blocks : nn.ModuleList
        List of `Block` modules.

    norm : nn.LayerNorm
        Layer normalization.

    head : nn.Linear
        Linear layer mapping the final cls token to `n_classes` logits.
    """

    def __init__(
            self,
            img_size=384,
            patch_size=16,
            in_chans=3,
            n_classes=1000,
            embed_dim=768,
            depth=12,
            n_heads=12,
            mlp_ratio=4.,
            qkv_bias=True,
            p=0.,
            attn_p=0.,
    ):
        super().__init__()

        # Patch splitting + embedding.
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        # Learnable class token and positional embedding, both zero-initialised.
        n_tokens = 1 + self.patch_embed.n_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, n_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=p)

        # Stack of `depth` identically configured transformer blocks.
        block_kwargs = dict(
            dim=embed_dim,
            n_heads=n_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            p=p,
            attn_p=attn_p,
        )
        self.blocks = nn.ModuleList(
            [Block(**block_kwargs) for _ in range(depth)]
        )

        # Final normalization and classification head.
        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        logits : torch.Tensor
            Logits over all the classes - `(n_samples, n_classes)`.
        """
        batch = x.shape[0]
        patches = self.patch_embed(x)

        # Prepend one copy of the class token per sample.
        cls = self.cls_token.expand(batch, -1, -1)  # (n_samples, 1, embed_dim)
        tokens = torch.cat([cls, patches], dim=1)  # (n_samples, 1 + n_patches, embed_dim)

        # Add the positional embedding, then apply dropout.
        tokens = self.pos_drop(tokens + self.pos_embed)  # (n_samples, 1 + n_patches, embed_dim)

        for blk in self.blocks:
            tokens = blk(tokens)

        tokens = self.norm(tokens)

        # Only the class token (index 0) is fed to the classification head.
        return self.head(tokens[:, 0])

### dropout layer extras

In [5]:
# Create a dropout layer with drop probability p and inspect its mode flag.
import torch

p = 0.5

module = torch.nn. Dropout(p)

# `training` tells whether the module is currently in training mode
module.training

True

In [6]:
# By default the dropout module is in training mode.
# Run a forward pass on an all-ones (3, 5) tensor to see its effect.

inp = torch.ones(3, 5)

module(inp)

tensor([[2., 2., 2., 2., 2.],
        [2., 2., 0., 2., 0.],
        [0., 0., 2., 2., 0.]])

In [7]:
# a second forward pass in training mode zeroes a different random set of elements
module(inp)

tensor([[0., 0., 0., 2., 0.],
        [0., 0., 0., 0., 2.],
        [0., 0., 0., 0., 2.]])

In [8]:
# Each forward pass removes approximately 50% of the elements by setting them to 0.
# To make up for this removal, the remaining elements are multiplied by the constant
# 1 / (1-p)
1 / (1-p)

2.0

In [10]:
# Let us now switch the module to evaluation mode
module.eval()


Dropout(p=0.5, inplace=False)

In [11]:
# check the mode flag again after calling eval()
module.training

False

In [12]:
# We see that internally the `training` boolean got set to False.
# Run the forward pass again, this time in evaluation mode.
module(inp)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [13]:
# repeat the forward pass in evaluation mode
module(inp)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [14]:
# and once more, to compare against the previous evaluation-mode outputs
module(inp)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [None]:
# As we can see, in evaluation mode the dropout layer behaves exactly like an identity mapping.

### linear layer behavior in case of 3 or higher dimensional tensors

In [15]:
# Create a linear layer with 10 input features and 20 output features.
import torch

module = torch.nn.Linear(10, 20)

module

Linear(in_features=10, out_features=20, bias=True)

In [16]:
n_samples = 40
# The most common way to use the linear layer is to give it a two-dimensional input:
# the first dimension corresponds to the samples (the batch),
# the second dimension equals the input features declared in the constructor.
inp_2d = torch.rand(n_samples, 10)

module(inp_2d).shape

torch.Size([40, 20])

In [17]:
# As we can see, for each sample the linear layer simply took the input features and mapped them to the output features.
# However, the linear layer can also be used on tensors with more than 2 dimensions.
# We only need to make sure that the input tensor's last dimension equals the input features declared in the constructor.
inp_3d = torch.rand(n_samples, 33, 10)

module(inp_3d).shape

torch.Size([40, 33, 20])

In [18]:
# The previous output tensor was created by applying the linear layer across all the samples and across the entire second dimension.
# The same works for a 7-dimensional input: only the last dimension has to equal the input features.

inp_7d = torch.rand(n_samples, 2, 3, 4, 5, 6, 10)

module(inp_7d).shape

torch.Size([40, 2, 3, 4, 5, 6, 20])

### basic properties of the layer norm

In [28]:
import torch

inp = torch.tensor([[0, 4.0], [-1, 7], [3, 5]])
# tensor with three samples and two features

n_samples, n_features = inp.shape

module = torch.nn.LayerNorm(n_features, elementwise_affine=False)
# Instantiate the layer norm with elementwise_affine=False;
# this way there won't be any learnable parameters.

# count the trainable parameters of the module
sum(p.numel() for p in module.parameters() if p.requires_grad)

0

In [29]:
# Let us now compute the mean and the (biased) standard deviation of each sample of our input
inp.mean(-1), inp.std(-1, unbiased=False)

(tensor([2., 3., 4.]), tensor([2., 4., 1.]))

In [38]:
# The layer norm uses these statistics to normalize the data, separately for each sample.
# Compute the per-sample mean and standard deviation of the normalized output.

module(inp).mean(-1), module(inp).std(-1, unbiased=False)

(tensor([ 0.0000e+00, -2.9802e-08,  1.1921e-07]),
 tensor([1.0000, 1.0000, 1.0000]))

In [39]:
# The layer norm makes sure that the mean and the standard deviation are 0 and 1, respectively, for each sample.
# This process is independent for different samples:
# the batch size doesn't play any role.

# Let us now re-instantiate the module with elementwise_affine=True
module = torch.nn.LayerNorm(n_features, elementwise_affine=True)

# count the trainable parameters of the module
sum(p.numel() for p in module.parameters() if p.requires_grad)

4

In [40]:
# The module now has 4 parameters, contained in its weight and its bias.
# They act as a learnable per-feature scale (weight) and shift (bias)
# that is used to rescale the normalized data.

(module.bias, module.weight)

(Parameter containing:
 tensor([0., 0.], requires_grad=True),
 Parameter containing:
 tensor([1., 1.], requires_grad=True))

In [41]:
# per-sample mean and standard deviation of the output with the freshly initialized affine parameters
module(inp).mean(-1), module(inp).std(-1, unbiased=False)

(tensor([ 0.0000e+00, -2.9802e-08,  1.1921e-07], grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))

In [42]:
# Upon running the forward pass it seems that nothing changed compared to elementwise_affine=False.
# However, this time around we actually did 2 things:
# first we normalized as before, and then we used our learnable parameters to rescale the data.
# The parameters are initialized in a way that makes it seem like the second step never happened.
# The parameters would get learned during training, or we can just change them manually here.

module.bias.data += 1
module.weight.data *= 4

module(inp).mean(-1), module(inp).std(-1, unbiased=False)

(tensor([1.0000, 1.0000, 1.0000], grad_fn=<MeanBackward1>),
 tensor([4.0000, 4.0000, 4.0000], grad_fn=<StdBackward0>))

In [43]:
# After updating the parameters the forward pass returns different tensors,
# which makes it clear that the second (rescaling) step is actually taking place.

# The input tensor can have an arbitrary number of dimensions as long as the last dimension equals the number of features.

module(torch.rand(n_samples, 2, 3, 4, 5, 6, n_features)).shape

torch.Size([3, 2, 3, 4, 5, 6, 2])

In [45]:
# It is always the last dimension that is being normalized:
# the mean taken over the last dimension is the same everywhere.
module(torch.rand(n_samples, 2, 3, 4, 5, 6, n_features)).mean(-1)

tensor([[[[[[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]],

           [[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]],

           [[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]],

           [[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
  

In [46]:
# standard deviation over the last dimension of the output for a fresh random input
module(torch.rand(n_samples, 2, 3, 4, 5, 6, n_features)).std(-1, unbiased=False)

tensor([[[[[[3.9995, 3.9996, 3.9992, 3.9981, 3.9987, 3.9981],
            [3.9986, 3.8908, 3.9987, 3.9987, 3.9997, 3.9976],
            [3.9660, 3.9999, 3.9999, 3.9998, 2.6700, 3.9999],
            [3.9994, 3.9991, 3.9998, 3.9942, 3.0745, 3.9986],
            [3.9634, 3.9995, 3.9962, 3.9907, 3.9998, 3.9994]],

           [[3.9999, 3.9997, 3.9995, 3.9999, 3.9984, 3.9988],
            [3.9998, 3.9985, 3.9996, 3.9993, 3.9994, 3.9505],
            [3.9801, 3.9995, 3.9989, 3.9979, 3.9997, 3.9985],
            [3.9985, 3.9990, 3.9999, 3.9989, 3.9579, 3.9657],
            [3.9997, 3.9997, 3.9961, 3.9760, 3.9991, 3.9997]],

           [[3.9938, 3.9995, 3.9801, 3.9982, 3.9955, 3.9997],
            [3.9998, 3.9945, 3.9994, 3.9997, 3.9997, 3.9999],
            [3.9676, 3.9998, 3.9998, 3.9998, 3.9952, 3.9998],
            [3.9957, 3.9914, 3.9035, 3.9996, 3.9999, 3.9996],
            [3.9560, 3.9996, 3.9990, 3.9997, 3.9937, 3.9998]],

           [[3.9955, 3.9993, 3.9617, 3.9991, 3.9240, 3.9996],
  