In [1]:
import torch
import torchvision
import torchinfo

In [2]:
# Device-agnostic setup: use CUDA when PyTorch reports it is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
import requests

# Download the course helper module. The request is made (and validated)
# BEFORE the output file is opened, so a network failure or an HTTP error
# cannot leave behind an empty / error-page "helper_functions.py" that would
# break the `from helper_functions import ...` in the next cell.
request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/refs/heads/main/helper_functions.py",
                       timeout = 30)  # do not hang forever on a stalled connection
request.raise_for_status()  # raise on 4xx/5xx instead of saving the error body
with open("helper_functions.py", "wb") as f:
    f.write(request.content)

In [None]:
from helper_functions import download_data

# Fetch the pizza/steak/sushi archive through the course helper; the value it
# returns is used below as the dataset root (it is joined with "train"/"test").
image_path = download_data(
    source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
    destination="pizza_steak_sushi",
)
image_path

In [None]:
# Re-display the dataset root returned by download_data.
image_path

In [None]:
# Build the train/test directory paths underneath the dataset root.
train_dir, test_dir = (image_path / split for split in ("train", "test"))
train_dir, test_dir

In [None]:
from going_modular import data_setup

IMAGE_SIZE = 224
BATCH_SIZE = 32

# Transform pipeline: resize every image to IMAGE_SIZE x IMAGE_SIZE, then
# convert it to a tensor.
manual_transforms = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize(size=(IMAGE_SIZE, IMAGE_SIZE)),
        torchvision.transforms.ToTensor(),
    ]
)

# Build both dataloaders (and the class-name list) with the project helper.
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    batch_size=BATCH_SIZE,
    transform=manual_transforms,
)
len(train_dataloader), len(test_dataloader), class_names


In [None]:
# Pull one batch from the training dataloader and keep its first sample
# (image tensor plus label) for the visualisations below.
image_batch, label_batch = next(iter(train_dataloader))
image = image_batch[0]
label = label_batch[0]
image.shape, label


In [None]:
# Shapes of the whole image batch and of its label batch.
image_batch.shape, label_batch.shape

In [None]:
import matplotlib.pyplot as plt

# permute(1, 2, 0) moves the first dimension of the image tensor to the end
# before it is handed to imshow.
plt.imshow(image.permute(1, 2, 0))
plt.title(class_names[label])
plt.axis("off")

In [None]:
# Example values used to work through the patch arithmetic
height, width = 224, 224
color_channels = 3
patch_size = 16

# Number of patches N = (H * W) / P^2
number_of_patches = int(height * width / patch_size ** 2)
number_of_patches

In [None]:
# Shape of a single image going into the embedding layer: (H, W, C)
embedding_layer_input_shape = (height, width, color_channels)
# Shape coming out of it: (N, P^2 * C)
embedding_layer_output_shape = (number_of_patches, color_channels * patch_size ** 2)

print(f"Input shape (single 2D image): {embedding_layer_input_shape}")
print(f"Output shape (single 1D sequence of patches): {embedding_layer_output_shape} -> (number_of_patches, embedding_dimension)")

In [None]:
# Equation 1: split the image into patches and create the class, position and patch embeddings

In [None]:
# Plot the sample image again as the starting point for patchifying;
# the trailing semicolon suppresses the return value of plt.axis.
plt.imshow(image.permute(1, 2, 0))
plt.title(class_names[label])
plt.axis("off");

In [None]:
# Shape of the single sample image tensor.
image.shape

In [None]:
# Reordered copy of the sample (first dimension moved last); the patch-grid
# plotting cell slices this tensor and passes the slices to imshow.
image_permuted = image.permute(1, 2, 0)

In [None]:
# Visualise the whole sample image as a grid of patch_size x patch_size tiles.
img_size = 224
patch_size = 16
num_patches = img_size // patch_size
assert img_size % patch_size == 0, "Image size must be divisible by patch_size"
print(f"Number of patches per row: {num_patches} \
      \nNumber of patches per column : {num_patches} \
      \nTotal patches : {num_patches * num_patches} \
      \nPatch size : {patch_size} pixels x {patch_size} pixels")

# One subplot per patch, sharing axes so the grid reads as a single image.
fig, axs = plt.subplots(
    nrows=num_patches,
    ncols=num_patches,
    figsize=(num_patches, num_patches),
    sharex=True,
    sharey=True,
)

# Walk the grid row by row; (top, left) is the pixel origin of each patch.
for row in range(num_patches):
    top = row * patch_size
    for col in range(num_patches):
        left = col * patch_size
        ax = axs[row, col]
        # Slice height, then width, and keep every colour channel.
        ax.imshow(image_permuted[top:top + patch_size, left:left + patch_size, :])
        ax.set_ylabel(
            row + 1,
            rotation="horizontal",
            horizontalalignment="right",
            verticalalignment="center",
        )
        ax.set_xlabel(col + 1)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.label_outer()

# Title for the whole figure
fig.suptitle(f"{class_names[label]} -> Patchified", fontsize=14)
plt.show()


In [None]:
from torch import nn

patch_size = 16

# Patch-embedding convolution: kernel size and stride both equal the patch
# size (with no padding), so each output position covers exactly one
# non-overlapping patch. 768 output channels = embedding size D (P^2 * C).
conv2d = nn.Conv2d(
    in_channels=3,        # colour images
    out_channels=768,
    kernel_size=patch_size,
    stride=patch_size,
    padding=0,
)
conv2d

In [None]:
# Add a leading batch dimension to the single image, then run it through
# the patch convolution.
img_out_of_conv = conv2d(image.unsqueeze(0))
img_out_of_conv.shape


In [None]:
# Plot five randomly chosen convolutional feature maps (embedding channels)
import random

random_indexes = random.sample(range(768), k=5)
print(f"Showing random convolutional feature maps from indexes : {random_indexes}")

# One row of five panels
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(12, 12))

for ax, idx in zip(axs, random_indexes):
    # Select one channel of the conv output, then: squeeze() drops the batch
    # dimension, detach() leaves the autograd graph, numpy() converts for imshow.
    feature_map = img_out_of_conv[:, idx, :, :]
    ax.imshow(feature_map.squeeze().detach().numpy())
    ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

In [None]:
# First feature map (channel index 0) of the conv output, batch dimension kept.
single_feature_map = img_out_of_conv[:, 0]
single_feature_map

In [None]:
from torch import nn

# Merge dimensions 2 and 3 of the conv output (the feature-map height and
# width) into a single dimension.
flatten_layer = nn.Flatten(start_dim=2, end_dim=3)

print(f"{flatten_layer(img_out_of_conv).shape} -> (batch_size, embedding_dimension, no. of patches)")
print("Wanted order (batch_size, no. of patches, embeddin_dimension)")


In [None]:
# Show the sample image, then walk it through the two patch-embedding steps.
plt.imshow(image.permute(1, 2, 0))
plt.title(class_names[label])
plt.axis(False)
print(f"Original shape : {image.shape}")

# Turn image into feature maps
image_out_of_conv = conv2d(image.unsqueeze(dim = 0)) # add batch dimension
# Use the tensor computed in THIS cell (previously the cell assigned
# `image_out_of_conv` but then read `img_out_of_conv` left over from an
# earlier cell, so it only worked through hidden notebook state).
print(f"Image featue map (patches) shape : {image_out_of_conv.shape}")

# Flatten the feature map
image_out_of_flattened = flatten_layer(image_out_of_conv)
print(f"Flattened image feature map shape: {image_out_of_flattened.shape}")

In [None]:
# Swap the last two dimensions so the patch dimension comes before the
# embedding dimension, and report the resulting shape.
print(f"{image_out_of_flattened.transpose(1, 2).shape} -> (batch_size, number_of_patches, embedding dimension)")

In [None]:
# Reminder of the original (unbatched) image tensor shape.
image.shape

In [None]:
# Current patch size used by the cells above.
patch_size


In [None]:
# Keep the reordered tensor: dimensions 1 and 2 of the flattened output swapped.
image_out_of_flattened_pemuted = image_out_of_flattened.transpose(1, 2)
image_out_of_flattened_pemuted.shape

In [None]:
# Take index 0 along the last dimension of the permuted sequence
single_flattened_feature_map = image_out_of_flattened_pemuted[..., 0]

# Draw it as an image; detach() + numpy() make the tensor plottable
plt.figure(figsize=(22, 22))
plt.imshow(single_flattened_feature_map.detach().numpy())
plt.title(f"Flattened feature map shape: {single_flattened_feature_map.shape}")
plt.axis("off")

In [None]:
def final_image(image, patch_size = None):
    """Turn one unbatched image into a sequence of flattened patch embeddings.

    A freshly initialised ``Conv2d`` (kernel == stride == ``patch_size``) is
    applied to the image, its spatial output is flattened and the result is
    permuted so the patch dimension precedes the embedding dimension.
    Intermediate shapes are printed along the way.

    Args:
        image: Image tensor indexed as (channels, height, width); a batch
            dimension is added internally.
        patch_size: Side length of each square patch. When omitted (or falsy)
            a patch size of 16 is used. Previously the ``None`` default was
            passed straight to ``Conv2d`` as ``kernel_size`` and crashed.

    Returns:
        Tensor of shape (1, number_of_patches, patch_size**2 * channels).
    """
    print(f"Input shape of given image {image.shape}")
    if not patch_size:
        # 16 matches the original fallback width: 16 * 16 * 3 == 768.
        patch_size = 16
    # Read the channel count from the image instead of hard-coding 3.
    in_channels = image.shape[0]
    OUT_CHANNELS = patch_size * patch_size * in_channels
    conv2d = torch.nn.Conv2d(in_channels = in_channels,
                             out_channels = OUT_CHANNELS,
                             kernel_size = patch_size,
                             stride = patch_size,
                             padding = 0)
    a = conv2d(image.unsqueeze(dim = 0))
    print(f"shape a : {a.shape}")
    # Collapse the spatial dimensions into a single "patch" dimension.
    flatten_layer = torch.nn.Flatten(start_dim = 2,
                                     end_dim = -1)
    b = flatten_layer(a)
    print(b.shape)
    # (batch, embedding, patches) -> (batch, patches, embedding)
    c = b.permute(0, 2, 1)
    print(c.shape)
    return c

In [None]:
# Run the helper on the sample image with 16 x 16 patches.
c = final_image(image, 16)
c.shape

In [None]:
class PatchEmbedding(nn.Module):
    """Turn a batch of images into a sequence of learnable patch embeddings.

    A ``Conv2d`` whose kernel size and stride both equal ``patch_size``
    produces one ``embedding_dim``-channel vector per patch; the two spatial
    dimensions are then flattened and moved ahead of the embedding dimension.

    Args:
        in_channels: Number of channels of the input images.
        patch_size: Side length of each square patch.
        embedding_dim: Size of the embedding produced for every patch.
    """
    def __init__(self,
                 in_channels:int  = 3,
                 patch_size: int = 16,
                 embedding_dim : int = 768):
        super().__init__()
        self.patch_size = patch_size
        self.patcher = nn.Conv2d(in_channels = in_channels,
                                 out_channels = embedding_dim,
                                 kernel_size = patch_size,
                                 stride = patch_size,
                                 padding = 0)
        self.flattened = nn.Flatten(start_dim = 2,
                                  end_dim = 3)

    def forward(self, x):
        """Return patch embeddings shaped (batch_size, number_of_patches, embedding_dim).

        Raises:
            AssertionError: if the last dimension of ``x`` is not a multiple
                of this layer's patch size.
        """
        # Check the input against THIS layer's patch size. The previous code
        # read a notebook-global `patch_size`, which raised NameError outside
        # the notebook and validated against the wrong value whenever the
        # global differed from the value given to the constructor.
        image_resolution = x.shape[-1]
        assert image_resolution % self.patch_size == 0, f"Input image size must be divisible by patch size , image shape: {image_resolution}, patch_size: {self.patch_size}"
        x_patched = self.patcher(x)
        x_flattened = self.flattened(x_patched)

        # Make sure the returned sequence embedding dimensions are in the right order (batch_size, number of patches, embedding_dimension)
        return x_flattened.permute(0, 2, 1)

In [None]:
# Seed the RNGs so the randomly initialised conv weights are reproducible.
torch.cuda.manual_seed(42)
torch.manual_seed(42)

# Patch-embedding layer for 3-channel images, 16x16 patches, 768-dim embeddings
patchify = PatchEmbedding(
    in_channels=3,
    patch_size=16,
    embedding_dim=768,
)

# Feed it the sample image with a batch dimension added in front.
print(f"Input image size: {image.unsqueeze(0).shape}")
patch_embedding_image = patchify(image.unsqueeze(0))
print(f"Output patch embedding sequence shape: {patch_embedding_image.shape}")

In [None]:
# Read the batch size (dim 0) and embedding size (dim 2) off the patch embedding.
batch_size = patch_embedding_image.size(0)
embedding_dimension = patch_embedding_image.size(2)
batch_size, embedding_dimension


In [None]:
# Learnable class token with the same width as the embedding dimension (D).
# torch.ones() is used here only to make the token easy to spot when printed;
# torch.randn() is what would normally be used.
class_token = nn.Parameter(
    torch.ones((batch_size, 1, embedding_dimension)),
    requires_grad=True,
)
class_token.shape

In [None]:
# Prepend the class token along dim 1 (the number-of-patches dimension).
patch_embedded_image_with_class_embedding = torch.cat(
    [class_token, patch_embedding_image],
    dim=1,
)
print(patch_embedded_image_with_class_embedding)
print(f"Sequence of patch embeddings with class token prepended {patch_embedded_image_with_class_embedding.shape} -> (batch_size, class_token + no_of_patches, embedding_dimension)")

In [None]:
# Creating the position embedding
# Create a series of 1D learnable position embeddings and add them to the sequence of patch embeddings
# NOTE: after adding the position embedding, the shape remains the same

In [None]:
# Show the class-token-prefixed sequence and its shape before positions are added.
patch_embedded_image_with_class_embedding, patch_embedded_image_with_class_embedding.shape

In [None]:
# Number of patches N = (H * W) / P^2 and the embedding width D
number_of_patches = int(height * width / patch_size ** 2)
embedding_dimension = patch_embedded_image_with_class_embedding.shape[-1]

# Learnable position embedding with N + 1 rows (the extra row lines up with
# the prepended class token).
position_embedding = nn.Parameter(
    torch.ones(1, number_of_patches + 1, embedding_dimension),
    requires_grad=True,
)
position_embedding, position_embedding.shape

In [None]:
# Element-wise sum of the token sequence and the position embedding.
patch_and_position_embedding = torch.add(
    patch_embedded_image_with_class_embedding,
    position_embedding,
)
patch_and_position_embedding, patch_and_position_embedding.shape


In [None]:
def myfunction_put_it_all_together_eqn1(image, patch_size):
    """Print the equation-1 embedding (patches + class token + positions) of one image.

    The image is patchified with a freshly initialised ``Conv2d``, a class
    token of ones is prepended, a position embedding of ones is added, and the
    resulting tensor and its shape are printed. Nothing is returned.

    Args:
        image: Unbatched image tensor indexed as (channels, height, width).
        patch_size: Side length of each square patch.

    Raises:
        AssertionError: if the height or the width is not a multiple of
            ``patch_size``.
    """
    print(f"Shape of input image: {image.shape}")
    channels, height, width = image.shape
    # Check each spatial dimension on its own: H * W being divisible by P^2
    # does not guarantee that H and W individually are (e.g. 32 x 24 with P=16).
    assert height % patch_size == 0 and width % patch_size == 0, f"Input image_size must be divisible by patch_size"
    no_of_patches = (height // patch_size) * (width // patch_size)
    print(f"No. of patches = {no_of_patches}")
    embedding_dim = (patch_size ** 2) * channels
    conv = torch.nn.Conv2d(in_channels=channels,
                           out_channels=embedding_dim,
                           kernel_size= patch_size,
                           stride = patch_size,
                           padding = 0)(image)
    # (embedding_dim, H/P, W/P) -> (embedding_dim, no_of_patches)
    flat = torch.nn.Flatten(start_dim = 1, end_dim = -1)(conv)
    # -> (1, no_of_patches, embedding_dim)
    flat = flat.permute(1, 0).unsqueeze(dim = 0)
    # Exactly one class token is prepended to the sequence. The previous code
    # sized this dimension with a notebook-global `batch_size`, which raised
    # NameError outside the notebook and prepended the wrong number of tokens
    # whenever that global was not 1.
    class_emb = torch.nn.Parameter(torch.ones(1, 1, embedding_dim),
                                   requires_grad = True)
    img_and_class_emb = torch.cat((class_emb, flat), dim = 1)
    # One position vector per token (class token + patches).
    pos_emb = torch.nn.Parameter(torch.ones(1, img_and_class_emb.shape[1], embedding_dim))
    print(pos_emb.shape)
    final_embedding = img_and_class_emb + pos_emb
    print(final_embedding)
    print(f"Final shape:{final_embedding.shape} -> (batch_size, class_name_emb + no. of batches, embedding_dimension)")

In [None]:
# Run the hand-written equation-1 helper on the sample image (16 x 16 patches).
myfunction_put_it_all_together_eqn1(image, 16)

In [None]:
# Equation 1 end to end: image -> patch embedding -> class token -> position embedding
torch.manual_seed(42)
torch.cuda.manual_seed(42)

patch_size = 16

# Original image tensor and its spatial size (dimensions 1 and 2)
print(f"Image tensor shape: {image.shape}")
height, width = image.shape[1:]

# Add a batch dimension in front
x = image.unsqueeze(0)
print(f"Input image shape: {x.shape}")

# Patch embedding layer and its output for the batched image
patch_embedding_layer = PatchEmbedding(
    in_channels=3,
    patch_size=patch_size,
    embedding_dim=768,
)
patch_embedding = patch_embedding_layer(x)
print(f"Patch embedding shape: {patch_embedding.shape}")

# Class token sized from the patch embedding's batch and embedding dimensions
batch_size = patch_embedding.size(0)
embedding_dimension = patch_embedding.size(-1)
class_token = nn.Parameter(
    torch.ones(batch_size, 1, embedding_dimension),
    requires_grad=True,
)
print(f"Class_token embedding shape: {class_token.shape}")

# Prepend the class token along the sequence dimension
patch_embedding_class_token = torch.cat([class_token, patch_embedding], dim=1)
print(f"Patch embedding with class token shape: {patch_embedding_class_token.shape}")

# 1D learnable position embedding: one row per patch plus one for the class token
number_of_patches = int(height * width / patch_size ** 2)
print(f"No of patchees {number_of_patches}")
position_embedding = nn.Parameter(
    torch.ones(1, number_of_patches + 1, embedding_dimension),
    requires_grad=True,
)
print(f"position_embedding shape : {position_embedding.shape}")

# Add the position embedding to the class-token-prefixed patch sequence
patch_and_position_embedding = torch.add(patch_embedding_class_token, position_embedding)
print(f"Patch adn position embedding shape: {patch_and_position_embedding.shape}")


In [None]:
# Turning equation 2 (the multi-head self-attention block, MSA) into code

In [None]:
class MUltiHeadAtttentionBlock(nn.Module):
    """Multi-head self-attention block ("MSA block"): LayerNorm, then self-attention.

    Args:
        embedding_dim: Hidden size D of the token embeddings.
        num_heads: Number of attention heads.
        attn_dropout: Dropout probability passed to the attention layer.
    """

    def __init__(self,
                 embedding_dim: int = 768,
                 num_heads : int = 12,
                 attn_dropout : float = 0):
        super().__init__()

        # Layer normalisation (LN) over the embedding dimension
        self.layer_norm = nn.LayerNorm(embedding_dim)

        # Multi-head attention; batch_first=True means inputs are laid out
        # as (batch, seq, features).
        self.multihead_attn = nn.MultiheadAttention(
            embedding_dim,
            num_heads,
            dropout=attn_dropout,
            batch_first=True,
        )

    def forward(self, x):
        """Normalise ``x`` and attend over it with query = key = value."""
        normed = self.layer_norm(x)
        # need_weights=False: only the attention output is needed, so the
        # second element of the returned tuple is discarded.
        outputs = self.multihead_attn(normed, normed, normed, need_weights=False)
        return outputs[0]

In [None]:
# Instantiate the MSA block
multihead_self_attention_block = MUltiHeadAtttentionBlock(
    embedding_dim=768,
    num_heads=12,
    attn_dropout=0,
)

# Send the patch + position embedding sequence through it and compare shapes
patched_image_through_msa_block = multihead_self_attention_block(patch_and_position_embedding)
print(f"Input shape of MSA block: {patch_and_position_embedding.shape}")
print(f"Ouput shape of MSA block : {patched_image_through_msa_block.shape}")

In [None]:
# Values that went INTO the MSA block.
patch_and_position_embedding

In [None]:
# Values that came OUT of the MSA block.
patched_image_through_msa_block

In [None]:
# Replicating equation 3: the multilayer perceptron block (MLP block)
# The MLP contains two layers with a GELU non-linearity.
# GELU (Gaussian Error Linear Unit) is an activation function.
# GELU can be thought of as a smoother ReLU.
# NOTE: "layers" can mean: fully-connected, dense, linear, feed-forward; these are often different names for the same thing.
# In PyTorch they're often called `torch.nn.Linear()` and in TensorFlow they might be called `tf.keras.layers.Dense()`.

# MLP number of hidden units = "MLP size" in Table 1
# MLP structure:
# x = linear -> non-linear -> dropout -> linear -> dropout

In [None]:
class MLPBlock(nn.Module):
    """MLP block: LayerNorm followed by Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        embedding_dim: Size of the incoming (and outgoing) token embeddings.
        mlp_size: Number of hidden units between the two linear layers.
        dropout: Dropout probability applied after the activation and after
            the second linear layer.
    """

    def __init__(self,
                 embedding_dim : int = 768,
                 mlp_size : int = 3072,
                 dropout : float = 0.1):
        super().__init__()

        # Layer normalisation (LN) over the embedding dimension
        self.layer_norm = torch.nn.LayerNorm(embedding_dim)

        # Expand to mlp_size, apply the non-linearity, project back down.
        layers = [
            nn.Linear(embedding_dim, mlp_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_size, embedding_dim),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer norm and then the MLP to ``x``."""
        return self.mlp(self.layer_norm(x))

In [None]:
# Instantiate the MLP block and feed it the output of the MSA block.
mlp_block = MLPBlock(embedding_dim = 768,
                     mlp_size = 3072,
                     dropout = 0.1)
patched_image_through_mlp_block = mlp_block(patched_image_through_msa_block)
print(f"INput shape of MLP block: {patched_image_through_msa_block.shape}")
# This line reports the MLP block's output; it was previously mislabelled "MSA block".
print(f"Output shape of MLP block: {patched_image_through_mlp_block.shape}")

In [None]:
# The transformer encoder is a combination of alternating layers of equation 2 and equation 3
# Residual connections = add a layer's input to its subsequent output; this enables the creation of deeper networks (prevents weights from getting too small, i.e. vanishing gradients)

# Transformer encoder
# x_input -> MSA_block -> [MSA_block_output + x_input] -> MLP_block -> [MLP_block_output + MSA_block_output + x_input] -> ...


In [None]:
class TransformerEncoderBlock(nn.Module):
    """One transformer encoder block: MSA block then MLP block, each with a skip connection.

    Args:
        embedding_dim: Hidden size D of the token embeddings.
        num_heads: Number of attention heads in the MSA block.
        mlp_size: Number of hidden units in the MLP block.
        mlp_dropout: Dropout probability used inside the MLP block.
        attn_dropout: Dropout probability used inside the attention layer.
    """
    def __init__(self,
                 embedding_dim : int = 768,
                 num_heads : int = 12,
                 mlp_size : int  = 3072,
                 mlp_dropout: float = 0.1,  # probabilities are floats (was annotated int)
                 attn_dropout: float = 0):
        super().__init__()

        # create MSA block (equation 2)
        self.msa_block = MUltiHeadAtttentionBlock(embedding_dim=embedding_dim,
                                                  num_heads=num_heads,
                                                  attn_dropout = attn_dropout)

        # create MLP block (equation 3)
        self.mlp_block = MLPBlock(embedding_dim = embedding_dim,
                                  mlp_size = mlp_size,
                                  dropout = mlp_dropout)

    def forward(self, x):
        """Apply both sub-blocks, adding each sub-block's input back onto its output."""
        x = self.msa_block(x) + x # residual/skip connection for equation 2
        x = self.mlp_block(x) + x # residual/skip connection for equation 3
        return x


In [None]:
import torchinfo

# Encoder block with its default hyperparameters
transformer_encoder_block = TransformerEncoderBlock()

# Layer-by-layer summary for an input of shape
# (batch_size, number_of_patches, embedding_dimension)
torchinfo.summary(
    model=transformer_encoder_block,
    input_size=(1, 197, 768),
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    col_width=20,
    row_settings=['var_names'],
)


In [None]:
# PyTorch's built-in encoder layer configured with the same hyperparameters
# as the hand-written block: 768-dim embeddings, 12 heads, 3072 MLP units.
torch_transformer_encoder_layer = nn.TransformerEncoderLayer(
    d_model=768,            # embedding dimension
    nhead=12,
    dim_feedforward=3072,   # MLP size
    dropout=0.1,
    activation='gelu',
    batch_first=True,
    norm_first=True,
)
torch_transformer_encoder_layer

In [None]:
from torchinfo import summary

In [None]:
# Summary of the built-in layer without an input size.
summary(model=torch_transformer_encoder_layer)

In [None]:
# Same summary, now with an input size so per-layer input/output shapes are shown.
summary(
    model=torch_transformer_encoder_layer,
    input_size=[1, 197, 768],
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    col_width=20,
    row_settings=['var_names'],
)

In [None]:
# putting it all together

In [None]:
# create a ViT class
class ViT(nn.Module):
    """Vision Transformer classifier assembled from the blocks defined in this notebook.

    Forward pipeline: patch embedding -> prepend class token -> add position
    embedding -> embedding dropout -> stack of transformer encoder blocks ->
    LayerNorm + Linear head applied to the token at index 0.

    The number of patches (and therefore the position embedding) is computed
    as ``img_size * img_size // patch_size ** 2``, i.e. for square inputs of
    side ``img_size``.

    Args:
        img_size: Side length of the input images.
        in_channels: Number of channels of the input images.
        patch_size: Side length of each square patch.
        num_transformers_layers: Number of stacked encoder blocks.
        embedding_dim: Hidden size D of the token embeddings.
        mlp_size: Number of hidden units in each MLP block.
        num_heads: Number of attention heads per encoder block.
        attn_dropout: Dropout probability inside the attention layers.
        mlp_dropout: Dropout probability inside the MLP blocks.
        embedding_dropout: Dropout probability applied to the embeddings
            right after the position embedding is added.
        num_classes: Number of output classes of the classifier head.

    Raises:
        AssertionError: if ``img_size`` is not divisible by ``patch_size``.
    """
    def __init__(self,
                 img_size : int = 224,
                 in_channels: int = 3,
                 patch_size : int = 16,
                 num_transformers_layers : int = 12,
                 embedding_dim : int = 768,
                 mlp_size : int = 3072,
                 num_heads : int = 12,
                 attn_dropout : float = 0,
                 mlp_dropout : float = 0.1,
                 embedding_dropout : float = 0.1,
                 num_classes : int = 1000): # number of classes in our classification problem
        super().__init__()

        # make an assertion that the image size is compatible with the patch size
        assert img_size % patch_size == 0 , f"Image size must be divisible by patch size, image: {img_size}, patch_size: {patch_size}"

        # calculate the number of patches (height * width / patch_size ** 2)
        self.num_patches = (img_size * img_size) // patch_size **2

        # create learnable class embedding
        self.class_embedding = nn.Parameter(data = torch.randn(1, 1, embedding_dim),
                                            requires_grad = True)

        # create learnable position embedding (one row per patch + one for the class token)
        self.position_embedding = nn.Parameter(data = torch.randn(1, self.num_patches + 1, embedding_dim))

        # create embedding dropout value
        self.embedding_dropout = nn.Dropout(p = embedding_dropout)

        # create patch embedding layer
        self.patch_embedding = PatchEmbedding(in_channels= in_channels,
                                              patch_size = patch_size,
                                              embedding_dim = embedding_dim)

        # create the transformer encoder: the list comprehension builds
        # num_transformers_layers blocks and `*` unpacks them into nn.Sequential.
        # attn_dropout is forwarded here; previously it was accepted by the
        # constructor but silently ignored, so every block used its own default.
        self.transformer_encoder = nn.Sequential(*[TransformerEncoderBlock(embedding_dim = embedding_dim,
                                                                           num_heads= num_heads,
                                                                           mlp_size= mlp_size,
                                                                           mlp_dropout=mlp_dropout,
                                                                           attn_dropout=attn_dropout) for _ in range(num_transformers_layers)])

        # create classifier head (eqn 4)
        self.classifier = nn.Sequential(
            nn.LayerNorm(normalized_shape = embedding_dim),
            nn.Linear(in_features = embedding_dim,
                      out_features = num_classes)
        )

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes) for an image batch ``x``."""
        # get the batch size
        batch_size = x.shape[0]

        # expand the single learnable class token to one per item in the batch (-1 keeps a dimension's size)
        class_token = self.class_embedding.expand(batch_size, -1, -1)

        # create the patch embedding (equation 1)
        x = self.patch_embedding(x)

        # concat class token embedding and patch embedding (equation 1)
        x = torch.cat((class_token, x), dim = 1) #(batch_size, number_of_patches + 1, embedding_dim)

        # add position embedding to class token and patch embedding
        x = self.position_embedding + x

        # apply dropout to patch embedding ("directly after adding positional - to patch embeddings")
        x = self.embedding_dropout(x)

        # pass position and patch embedding to transformer encoder
        x = self.transformer_encoder(x)

        # put the token at index 0 (the class token position) through the classifier (equation 4)
        x = self.classifier(x[:, 0])

        return x

In [None]:
# Demonstrate how one (1, 1, D) class embedding is expanded across a batch.
batch_size = 32
embedding_dim = 768

class_embedding = nn.Parameter(
    data=torch.randn(1, 1, embedding_dim),
    requires_grad=True,
)
# -1 leaves a dimension's size unchanged; only dim 0 grows to batch_size.
class_embedding_expanded = class_embedding.expand(batch_size, -1, -1)

print(class_embedding.shape)
print(class_embedding_expanded.shape)

In [None]:
# Instantiate the model with its default hyperparameters and display its module tree.
vit = ViT()
vit

In [None]:
# Smoke-test the model: one random image-shaped tensor in, one row of logits out.
random_image_tensor = torch.randn(1, 3, 224, 224)

# Size the classifier head to the number of classes in the dataset.
vit = ViT(num_classes=len(class_names))

# Pass the tensor defined above (the call previously referenced the undefined
# name `rand_image_tensor`, which raised NameError).
vit(random_image_tensor)

In [None]:
# Layer-by-layer summary of a 1000-class ViT for one 3 x 224 x 224 image
summary(
    model=ViT(num_classes=1000),
    input_size=(1, 3, 224, 224),
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    col_width=20,
    row_settings=['var_names'],
)

In [None]:
# The parameter iterator that is handed to the optimizer below.
vit.parameters()

In [None]:
# Adam optimizer over all ViT parameters, with weight decay.
optimizer = torch.optim.Adam(
    vit.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
)


In [None]:
# Loss function / criterion: cross-entropy loss for the classifier's outputs.
loss_fn = torch.nn.CrossEntropyLoss()


In [None]:
# Train the model with the project's training loop
from going_modular import engine

results = engine.train(
    model=vit,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    epochs=10,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)