# Impoert Libraries

In [3]:
from torchvision.models import resnet50, ResNet50_Weights
import os
import numpy as np
import sys
import glob
import torch
import torch.optim as optim
import torchvision
import torch.nn as nn
import torchvision.transforms as TF
import pandas as pd
import cv2
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from torch.utils.data import Dataset
from transformers import ViTConfig, ViTModel
from PIL import Image
from collections import OrderedDict 


NameError: name '_C' is not defined

# Model

## CNN BackBones

### Forward Hook

In [2]:
class ModelHook(nn.Module):
    """
    A PyTorch module to retrieve the output of specified layers in a model using forward hooks.

    Args:
        model (nn.Module): The model from which the output is to be retrieved.
        output_layers (list): A list of layer names for which the output needs to be captured.

    Attributes:
        output_layers (list): A list of layer names for which the output needs to be captured.
        selected_out (OrderedDict): A dictionary to store the output of selected layers.
        model (nn.Module): The model from which the output is retrieved.
        fhooks (list): A list to hold the forward hooks registered for selected layers.

    Methods:
        forward_hook(layer_name): Method to create a forward hook for a specific layer.
        forward(x): Forward method of the module.

    Returns:
        out (torch.Tensor): The output tensor from the model's forward pass.
        selected_out (OrderedDict): A dictionary containing the output tensors of selected layers.

    Example:
        # Instantiate a ResNet model
        resnet_model = torchvision.models.resnet18(pretrained=True)

        # Define layers for which output needs to be captured
        output_layers = ['conv1', 'layer1', 'layer2']

        # Instantiate ModelHook module
        model_hook = ModelHook(resnet_model, output_layers)

        # Forward pass
        inputs = torch.randn(1, 3, 224, 224)
        out, selected_out = model_hook(inputs)

        # Output of selected layers can be accessed from 'selected_out' dictionary
        print(selected_out)
    """
    def __init__(self,model, output_layers, *args):
        super().__init__(*args)
        self.output_layers = output_layers
        # print(self.output_layers)
        self.selected_out = OrderedDict()
        #PRETRAINED MODEL
        self.model = model
        self.fhooks = []

        for l in list(self.model._modules.keys()):
            if l in self.output_layers:
                self.fhooks.append(getattr(self.model,l).register_forward_hook(self.forward_hook(l)))
    
    def forward_hook(self,layer_name):
        def hook(module, input, output):
            self.selected_out[layer_name] = output
        return hook

    def forward(self, x):
        out = self.model(x)
        return out, self.selected_out

### Modify CNN Model

In [None]:
class CNNBackBone(nn.Module):
    def __init__(self, *args, hidden_size = 768, hidden_dropout_prob = 0.0,attention_probs_dropout_prob = 0.0):
        super().__init__(*args)
        # Load and impliment models
        self.resnet50_module = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2).to("cuda")
        self.avg_pool = nn.AvgPool2d(kernel_size=4, stride=4).to("cuda")
        self.middle_linear = nn.Linear(512, hidden_size).to("cuda")
        self.end_linear = nn.Linear(2048, hidden_size).to("cuda")
        
        # Remove extra layers from CNN block (ResNetx) and add hook to it
        layers_dict = {name: module for name,
        module in zip(list(self.resnet50_module._modules.keys()),
                             list(self.resnet50_module.children())[:-2])} #all layers except last two
        self.resnet50_module = torch.nn.Sequential(OrderedDict(layers_dict))
        self.CNN_block = ModelHook(self.resnet50_module, ["layer2","layer4"])
        
        

    def forward(self, x):
        # Send originad tgrough the Resnetx model to extract middle and end layer output
        _, CNN_outputs = self.CNN_block(x)
        
        # Generate matrixe of size (-1,512,16,16) out of layer2 of ResNetx
        CNN_middle_layer_out = self.avg_pool(CNN_outputs["layer2"])
        # Generate matrixe of size (-1,2048,16,16) out of layer4 of ResNetx
        CNN_end_layer_out = CNN_outputs["layer4"]
        # print(CNN_middle_layer_out.shape)
        # print(CNN_end_layer_out.shape)
        
        # Merge dimentions of heigth and width of matrixes into each other and swap dimentions to generate 256 vectors with the length of 512 and 2048
        CNN_middle_layer_out = CNN_middle_layer_out.permute(0, 2, 3, 1).contiguous().view(1, -1, 512)
        CNN_end_layer_out = CNN_end_layer_out.permute(0, 2, 3, 1).contiguous().view(1, -1, 2048)
        # print(CNN_middle_layer_out.shape)
        # print(CNN_end_layer_out.shape)
        
        # Send vectors throgh an MLP layer to make the generate vectors with length of 768
        CNN_middle_layer_out = self.middle_linear(CNN_middle_layer_out)
        CNN_end_layer_out = self.end_linear(CNN_end_layer_out)   
        
        print(CNN_middle_layer_out.shape)
        print(CNN_end_layer_out.shape)
        return CNN_end_layer_out, CNN_middle_layer_out
    

## Transformer

### ViTEmbedding

In [None]:
class ViTEmbeddings(nn.Module):

    def __init__(self, config, num_patches=16*16):
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        
        # NOTE: THE ORIGINAL PatchEmbeddings ELIMINATED
        # self.patch_embeddings = PatchEmbeddings(
        #     image_size=config.image_size,
        #     patch_size=config.patch_size,
        #     num_channels=config.num_channels,
        #     embed_dim=config.hidden_size,
        # )
        # num_patches = self.patch_embeddings.num_patches
        # self.patch_embeddings = CNNBackBone()
        
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    
    
    def forward(self, pixel_values):
        print("modified embedding")
        batch_size = pixel_values.shape[0]
        
        # NOTE: THE ORIGINAL embedings ELIMINATED
        # embeddings = self.patch_embeddings(pixel_values)
        embeddings = pixel_values
        
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
        embeddings = embeddings + self.position_embeddings
        embeddings = self.dropout(embeddings)
        return embeddings
    

## Integration

In [24]:
x = torch.randn((1,3,512,512)).to("cuda")
configuration = ViTConfig(image_size =256)
out1_last, out1_middle = CNNBackBone()(x)
out2_last = ViTEmbeddings(configuration).to("cuda")(out1_last)
out2_middle = ViTEmbeddings(configuration).to("cuda")(out1_middle)

torch.Size([1, 512, 16, 16])
torch.Size([1, 2048, 16, 16])
torch.Size([1, 256, 512])
torch.Size([1, 256, 2048])
torch.Size([1, 256, 768])
torch.Size([1, 256, 768])
section2
section2


In [2]:
from transformers import ViTConfig, ViTModel
configuration = ViTConfig()

# Initializing a model (with random weights) from the vit-base-patch16-224 style configuration
model = ViTModel(configuration)

# Accessing the model configuration
configuration = model.config

In [3]:
model

ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

In [35]:
class ViTEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    layer_head_mask,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
ViTEncoder(configuration)

ImportError: attempted relative import with no known parent package

In [16]:
from torchvision import models
from torchsummary import summary

configuration = ViTConfig(image_size =256)
model = ViTModel(configuration)

summary(model.embeddings.to("cuda"), (3, 256, 256))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 16, 16]         590,592
ViTPatchEmbeddings-2             [-1, 256, 768]               0
           Dropout-3             [-1, 257, 768]               0
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 4.51
Params size (MB): 2.25
Estimated Total Size (MB): 7.51
----------------------------------------------------------------


In [25]:
summary(model.encoder.to("cuda"), (1,1, 257, 768))


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 6 is not equal to len(dims) = 4

In [11]:

# configuration = ViTConfig(image_size =256)
# m = ViTEmbeddings(configuration).to("cuda")
configuration = ViTConfig(image_size =256)
m = ViTModel(configuration).to("cuda").embeddings
x = torch.randn((1,3,256,256)).to("cuda")
m(x).shape

torch.Size([1, 257, 768])

In [2]:
from src.transformers.models.vit.modeling_vit import ViTEncoder, ViTAttention, ViTSelfAttention
import torch
from transformers import ViTConfig, ViTModel
configuration = ViTConfig(image_size =256)

m = ViTSelfAttention(configuration).to("cuda")
x = torch.randn((1, 257, 768)).to("cuda")
m(x)
m = ViTEncoder(configuration).to("cuda")
m(x)

BaseModelOutput(last_hidden_state=tensor([[[-2.5096, -0.7161, -1.1760,  ..., -0.4365, -2.1618,  1.1930],
         [ 0.6674,  2.5342,  1.0436,  ..., -0.0813,  0.3745,  0.4982],
         [-0.8910, -0.0924,  0.8046,  ...,  0.8526,  0.3763, -2.0456],
         ...,
         [ 0.2197, -1.9019,  0.4884,  ..., -1.1542, -2.1027, -1.7820],
         [-0.1396,  1.1063, -0.4004,  ..., -0.4058, -0.7316, -2.0000],
         [ 1.9031,  1.6353, -0.7972,  ..., -2.7012, -0.3411,  0.2982]]],
       device='cuda:0', grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

NameError: name '__file__' is not defined

In [15]:
import sys
import os

SCRIPT_DIR = "D:\\desk top folders\\ML\\transformers\\src\\transformers\models\\vit"
sys.path.append(os.path.dirname(SCRIPT_DIR))
