<h2> Image Embedding </h2>

In [1]:
!pip install datasets
!pip install huggingface_hub
!pip install tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
from torchvision import transforms
from datasets import load_dataset
from huggingface_hub import login
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
from PIL import Image
from transformers import AutoTokenizer, AutoModel
class PatchEmbedings(torch.nn.Module):
    """Turn a batch of square RGB images into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    extracts non-overlapping patches and projects each one to
    ``hidden_size`` channels in a single step.

    Args:
        img_size: Expected height and width of the input images.
        patch_size: Side length of each square patch.
        hidden_size: Number of output channels per patch.
    """

    def __init__(self, img_size = 224, patch_size = 16, hidden_size = 768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2

        # Kernel == stride, so every patch is seen exactly once.
        self.conv = nn.Conv2d(
            in_channels = 3,
            out_channels = hidden_size,
            kernel_size = patch_size,
            stride = patch_size,
        )

        # Xavier-uniform weights, zero bias.
        nn.init.xavier_uniform_(self.conv.weight)
        if self.conv.bias is not None:
            nn.init.zeros_(self.conv.bias)

    def forward(self, x):
        """Embed ``x`` of shape (batch, 3, img_size, img_size).

        Returns:
            Tensor of shape (batch, num_patches, hidden_size).

        Raises:
            ValueError: If dimension 2 or 3 of ``x`` differs from ``img_size``.
        """
        if not (x.size(2) == self.img_size and x.size(3) == self.img_size):
            raise ValueError(f"Input image size is different than model trained one {x.shape}. \n It must be {self.img_size} x {self.img_size}")

        # (B, hidden, H/p, W/p) -> (B, hidden, num_patches) -> (B, num_patches, hidden)
        return self.conv(x).flatten(2).transpose(1, 2)


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

<h2>Multi-Head Self Attention mechanism</h2>

In [15]:
class HeadAttentionLayer(nn.Module):
    """Single head of scaled dot-product self-attention.

    Args:
        dropout: Dropout probability applied to the attention weights.
        is_decoder: When true, a causal (lower-triangular) mask is applied so
            every position attends only to itself and earlier positions.
        n_embd: Size of the last dimension of the input.
        head_size: Size of the key/query/value projections and of the output.
    """

    def __init__(self, dropout, is_decoder, n_embd, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd,head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.dropout = nn.Dropout(dropout)
        self.is_decoder = is_decoder

    def forward(self,x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq, n_embd) or (batch, 1, seq, n_embd).

        Returns:
            Tensor of shape (batch, seq, head_size).

        Raises:
            ValueError: If ``x`` is not 3-D after the optional squeeze.
        """
        # Only drop the singleton axis of a 4-D input. An unconditional
        # squeeze(1) would also remove the sequence axis of a (batch, 1, n_embd)
        # input and make single-token sequences crash.
        if x.dim() == 4:
            x = x.squeeze(1)
        if x.dim() != 3:
            raise ValueError(f"Expected input of shape (batch, seq, n_embd), got {tuple(x.shape)}")

        seq_length = x.size(1)
        key = self.key(x)
        query = self.query(x)
        value = self.value(x)

        # Dot product between every query and every key, scaled by
        # 1/sqrt(d_k) where d_k is the key dimension (head_size), not the
        # input embedding width. The scaling keeps the logits from growing
        # with d_k, which would push softmax into regions with tiny gradients.
        wei = query @ key.transpose(-2,-1) * (key.size(-1) ** -0.5)

        if self.is_decoder:
            # Causal mask: position i may not look at positions j > i.
            tril = torch.tril(torch.ones(seq_length, seq_length, dtype = torch.bool, device = x.device))
            wei = wei.masked_fill(~tril, float("-inf"))

        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ value

        return out

In [3]:
class MultiModalProjector(nn.Module):
    """Two-layer MLP (Linear -> GELU -> Linear -> Dropout) that keeps the feature size.

    Args:
        n_embd: Input and output feature size; the hidden layer is 4x wider.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, n_embd, dropout = 0.1):
        super().__init__()
        hidden_dim = n_embd * 4
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self,x):
        """Project ``x`` along its last dimension; the shape is preserved."""
        return self.net(x)

In [None]:
class VisionLanguageModel(nn.Module):
    """Container for the vision encoder, projector and language decoder.

    Work in progress: the sub-modules are created, but the forward pass has
    not been written yet.

    Args:
        n_embd, image_embed_dim, vocab_size, n_layer, img_size, patch_size,
        num_heads, num_blks, emb_dropout, blk_dropout: Model hyper-parameters.
            Only ``image_embed_dim`` and ``num_heads`` are used so far (for
            the divisibility check); the rest are accepted but not yet wired
            into the sub-modules.
    """

    def __init__(self, n_embd, image_embed_dim, vocab_size,
                 n_layer, img_size, patch_size, num_heads,
                 num_blks, emb_dropout, blk_dropout):
        super().__init__()
        num_hiddens = image_embed_dim
        assert num_hiddens % num_heads == 0

        # NOTE(review): the sizes below are hard-coded and ignore the
        # constructor arguments; the encoder emits 512-wide embeddings while
        # the projector and decoder expect 768. Confirm the intended sizes
        # before implementing forward().
        self.vision_encoder = PatchEmbedings(96,16,512)
        self.language_decoder = HeadAttentionLayer(0.1,True,768,5)
        # The original line called `self.MultiModalProjector(...)`, which
        # raises AttributeError and never registers the sub-module.
        self.multi_modal_projector = MultiModalProjector(768, 0.1)

    def forward(self, x):
        """Forward pass (not implemented yet).

        Raises:
            NotImplementedError: Always, until the data flow is defined.
        """
        raise NotImplementedError("VisionLanguageModel.forward has not been implemented yet")

<h2> Helper functions </h2>

In [None]:
def image_embedding(image,patch_size):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        image: Tensor of shape (batch, channels, height, width).
        patch_size: Side length of each patch. Rows/columns that do not fill
            a whole patch are dropped by ``unfold``.

    Returns:
        Tensor of shape (batch, num_patches, channels * patch_size * patch_size),
        with patches ordered row by row.
    """
    print(f"Before unfold {image.shape}")
    # unfold over height (dim 2) then width (dim 3):
    # (B, C, H, W) -> (B, C, H/p, W/p, p, p)
    patches = image.unfold(2,size = patch_size, step = patch_size).unfold(3,size = patch_size, step = patch_size)
    num_patches_h = image.shape[2]//patch_size
    num_patches_w = image.shape[3]//patch_size
    num_patches = num_patches_h * num_patches_w
    print(f"After unfold {patches.shape}")
    # permute returns a non-contiguous tensor; .contiguous() is required before .view
    patches = patches.permute(0,2,3,1,4,5).contiguous()
    print(patches.shape)
    patches = patches.view(image.shape[0], num_patches, -1)
    print(f"Patches shape: {patches.shape}")
    return patches


<h2> Main code </h2>

In [24]:
class Dataset(torch.utils.data.Dataset):
  """Pairs images with captions and embeds both on access.

  Args:
    features: Indexable collection of images (PIL images are expected by the
      transform pipeline).
    labels: Indexable collection of caption strings, aligned with ``features``.
    patch_embedding: Callable applied to a (1, C, H, W) image tensor that
      returns the patch embeddings for that image.
  """

  def __init__(self,features,labels,patch_embedding):
    # Training examples
    self.features = features
    self.labels = labels
    self.patch_embedding = patch_embedding

    # Resize + light augmentation + scaling to [-1, 1]
    self.train_transforms = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.RandomHorizontalFlip(p = 0.4),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(mean = [0.5,0.5,0.5], std = [0.5, 0.5 ,0.5])
    ])

    # Tokenization and token embedding table
    self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    self.embedding = nn.Embedding(self.tokenizer.vocab_size,768)

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    """Return ``(patch_embeddings, caption_embeddings, attention_mask)`` for item ``idx``."""
    image = self.features[idx]
    caption = self.labels[idx]

    # Normalize and the patch convolution both expect 3 channels; grayscale,
    # palette or RGBA images would otherwise fail downstream.
    if isinstance(image, Image.Image) and image.mode != "RGB":
      image = image.convert("RGB")

    label_token = self.tokenizer(caption,padding = "max_length", max_length=14, truncation = True, return_tensors = "pt")
    pixels = self.train_transforms(image)

    # Add a leading batch axis so the single image can go through the patch
    # embedding, then drop it again.
    patches = self.patch_embedding(pixels.unsqueeze(0)).squeeze(0)

    # input_ids is (1, 14); after embedding and unsqueeze it is (1, 1, 14, 768).
    label_tokens_final = self.embedding(label_token['input_ids']).unsqueeze(0)

    # NOTE(review): the 0/1 attention mask is looked up in the token embedding
    # table, so it returns the embeddings of token ids 0 and 1 rather than a
    # mask. Left as-is to keep the returned shape unchanged; it most likely
    # should be returned raw.
    attention_mask = self.embedding(label_token['attention_mask']).unsqueeze(0)

    return patches, label_tokens_final, attention_mask

def model_training(dataloader, patch_embedding, head_attention, multi_modal_projector):
  """Run every batch through the projector and the attention head.

  Despite the name, this is a forward pass only: no loss is computed and no
  parameters are updated.

  Args:
    dataloader: Iterable yielding ``(x, y, z)`` batches. ``y`` is reshaped
      from 5-D to ``(batch, y.shape[3], y.shape[4])``.
    patch_embedding: Unused here.
    head_attention: Callable applied to the concatenated image/text sequence.
    multi_modal_projector: Callable applied to ``x`` before concatenation.

  Returns:
    The attention output of the last batch, or ``None`` if the dataloader
    yielded no batches.
  """
  # Initialised up front so an empty dataloader returns None instead of
  # raising UnboundLocalError at the return statement.
  text_output = None

  for (x,y,z) in tqdm(dataloader, desc = "TRAINING"):
    print(f"Tokens {y.shape}")
    # Collapse the two singleton axes: (B, 1, 1, T, C) -> (B, T, C)
    y = y.view(y.shape[0], y.shape[3], y.shape[4])
    print(f"Tokens {y.shape}")

    # NOTE(review): the device is hard-coded; the modules must live on the CPU too.
    x = x.to("cpu")
    y = y.to("cpu")
    z = z.to("cpu")

    modal_output = multi_modal_projector(x)

    # Image tokens first, caption tokens after, along the sequence axis.
    image_text = torch.cat([modal_output, y], dim=1)
    text_output = head_attention(image_text)

  return text_output

def tokenizer_trials(tokens):
  """Print the length of the longest entry found under the 'tokens' key of any record."""
  longest = max(
    (len(sentence) for record in tokens for sentence in record['tokens']),
    default = 0,
  )
  print(longest)

In [6]:
from google.colab import drive
import os

# SECURITY: never commit an access token to a notebook. The token that was
# hard-coded here must be treated as leaked and revoked in the Hugging Face
# settings. Read it from the HF_TOKEN environment variable instead; when it
# is not set, login() receives None and asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))
dataset = load_dataset("xcpan/coco2017", split ="train")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/329 [00:00<?, ?B/s]

train-00000-of-00011.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

train-00001-of-00011.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

train-00002-of-00011.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00003-of-00011.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

train-00004-of-00011.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

train-00005-of-00011.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

train-00006-of-00011.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00007-of-00011.parquet:   0%|          | 0.00/478M [00:00<?, ?B/s]

train-00008-of-00011.parquet:   0%|          | 0.00/479M [00:00<?, ?B/s]

train-00009-of-00011.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

train-00010-of-00011.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/102512 [00:00<?, ? examples/s]

Dataset({
    features: ['caption', 'image'],
    num_rows: 102512
})


<h2> About the following cell </h2>

<p> First, we take only a small part of the dataset. I do this to check that the whole pipeline works correctly and to debug faster. For the real training run this step will be commented out, since we want to use the whole dataset. </p>

<p> Second, I instantiate PatchEmbedings, which splits the images into patches and embeds them. After that I initialize the HeadAttentionLayer that I use for the self-attention mechanism, which helps compute the context of the captions. </p>

<p> Then I get the training images and captions, create a custom Dataset class that inherits from PyTorch's <code>torch.utils.data.Dataset</code>, and instantiate a DataLoader to iterate over it. </p>

Finally, I call <code>model_training</code>, the function that starts the training pipeline.

<h3> Disclaimer </h3>
You have probably noticed that I do not pass the images (features) through PatchEmbedings up front, and that I pass the patch-embedding object to the Dataset class instead. This is because each image is embedded lazily, when the DataLoader iterates over it, and not before.

In [25]:
from datasets import load_from_disk
from PIL import Image

# Debug-sized run: work on a small shuffled subset so the whole pipeline can
# be checked quickly. For a full run, skip the subset selection.
SUBSET_SEED = 42
SUBSET_SIZE = 500
BATCH_SIZE = 32

small_dataset = dataset.shuffle(seed=SUBSET_SEED).select(range(SUBSET_SIZE))

# Model pieces (created in this order so parameter initialisation is unchanged).
patch = PatchEmbedings()
head_attention = HeadAttentionLayer(dropout=0.1, is_decoder=True, n_embd=768, head_size=5)
multi_modal_projector = MultiModalProjector(n_embd=768)

# TODO: alternative smaller dataset for quick experiments: "HuggingFaceM4/COCO" (train split).
images = small_dataset['image']
caption = small_dataset['caption']

# Images are embedded lazily inside the Dataset, one item at a time.
dataset_class = Dataset(features=images, labels=caption, patch_embedding=patch)
training_loader = DataLoader(dataset_class, batch_size=BATCH_SIZE, shuffle=True)

text_output = model_training(
    training_loader,
    patch,
    head_attention,
    multi_modal_projector,
)
print(f"Post training checking \n Head attention (text) output shape {text_output.shape}")



TRAINING:   0%|          | 0/16 [00:00<?, ?it/s]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:   6%|▋         | 1/16 [00:02<00:30,  2.02s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  12%|█▎        | 2/16 [00:03<00:23,  1.71s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  19%|█▉        | 3/16 [00:04<00:20,  1.60s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  25%|██▌       | 4/16 [00:06<00:20,  1.71s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  31%|███▏      | 5/16 [00:08<00:18,  1.71s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  38%|███▊      | 6/16 [00:09<00:15,  1.55s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  44%|████▍     | 7/16 [00:11<00:13,  1.46s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  50%|█████     | 8/16 [00:12<00:11,  1.39s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  56%|█████▋    | 9/16 [00:13<00:09,  1.35s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  62%|██████▎   | 10/16 [00:14<00:08,  1.37s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  69%|██████▉   | 11/16 [00:16<00:06,  1.35s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  75%|███████▌  | 12/16 [00:17<00:05,  1.34s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  81%|████████▏ | 13/16 [00:19<00:04,  1.51s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  88%|████████▊ | 14/16 [00:21<00:03,  1.58s/it]

Tokens torch.Size([32, 1, 1, 14, 768])
Tokens torch.Size([32, 14, 768])


TRAINING:  94%|█████████▍| 15/16 [00:22<00:01,  1.49s/it]

Tokens torch.Size([20, 1, 1, 14, 768])
Tokens torch.Size([20, 14, 768])


TRAINING: 100%|██████████| 16/16 [00:23<00:00,  1.46s/it]

Post training checking 
 Head attention (text) output shape torch.Size([20, 210, 5])



