<h2> Image Embedding </h2>

In [None]:
!pip install datasets
!pip install huggingface_hub
!pip install tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
from torchvision import transforms
from datasets import load_dataset
from huggingface_hub import login
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
from PIL import Image
from transformers import AutoTokenizer, AutoModel
class PatchEmbedings(torch.nn.Module):
    """Turn a batch of square RGB images into a sequence of patch embeddings.

    A single convolution whose kernel size and stride both equal
    ``patch_size`` projects every non-overlapping patch to ``hidden_size``
    channels; the spatial grid is then flattened into a sequence axis.

    Args:
        img_size: Required height and width of the input images.
        patch_size: Side length of each square patch.
        hidden_size: Number of output channels per patch.
    """

    def __init__(self, img_size = 224, patch_size = 16, hidden_size = 768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2

        # Kernel == stride, so every output position sees exactly one patch.
        self.conv = nn.Conv2d(3, hidden_size, kernel_size = patch_size, stride = patch_size)

        nn.init.xavier_uniform_(self.conv.weight)
        if self.conv.bias is not None:
            nn.init.zeros_(self.conv.bias)

    def forward(self, x):
        """Embed ``x`` and return a (batch, patches, hidden_size) tensor.

        Raises:
            ValueError: If dimension 2 or 3 of ``x`` differs from ``img_size``.
        """
        if not (x.size(2) == self.img_size and x.size(3) == self.img_size):
            raise ValueError(f"Input image size is different than model trained one {x.shape}. \n It must be {self.img_size} x {self.img_size}")

        # conv -> (batch, hidden, rows, cols); flatten(2) merges the grid into
        # one axis; transpose puts the patch axis before the channel axis.
        return self.conv(x).flatten(2).transpose(1, 2)




<h2>Multi-Head Self Attention mechanism</h2>

In [None]:
class Head(nn.Module):
    """One head of scaled dot-product self-attention.

    Args:
        dropout: Dropout probability applied to the attention weights.
        is_decoder: When true, a lower-triangular (causal) mask stops each
            position from attending to later positions.
        n_embd: Size of the last dimension of the input.
        head_size: Output size of the key, query and value projections.
    """

    def __init__(self, dropout, is_decoder, n_embd, head_size):
        # nn.Module.__init__ takes no arguments.
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.dropout = nn.Dropout(dropout)
        self.is_decoder = is_decoder

    def forward(self, x):
        """Attend over a 3-D input and return a tensor whose last dim is head_size."""
        _, seq_length, _ = x.shape
        key = self.key(x)
        query = self.query(x)
        value = self.value(x)

        # Scale by 1/sqrt(d_k), where d_k is the key width (head_size), not
        # the input embedding width.
        wei = query @ key.transpose(-2, -1) * (key.size(-1) ** -0.5)

        if self.is_decoder:
            tril = torch.tril(torch.ones(seq_length, seq_length, dtype = torch.bool, device = x.device))
            # Positions above the diagonal are "future" tokens; -inf makes
            # their softmax weight exactly zero.
            wei = wei.masked_fill(~tril, float("-inf"))

        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ value

        return out

In [None]:
class MultiModalProjector(nn.Module):
    """Two-layer MLP that maps image embeddings to the width ``n_embd``.

    Args:
        n_embd: Output feature size.
        image_embed_dim: Input feature size; the hidden layer is four times
            this wide.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, n_embd, image_embed_dim, dropout = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(image_embed_dim, image_embed_dim * 4),
            # The PyTorch class is spelled GELU; nn.GeLU does not exist.
            nn.GELU(),
            nn.Linear(image_embed_dim * 4, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Project the last dimension of ``x`` from image_embed_dim to n_embd."""
        x = self.net(x)
        return x

In [None]:
class VisionLanguageModel(nn.Module):
    """Container that wires a patch-embedding vision encoder to an attention head.

    Only the constructor exists so far; no forward pass is defined.

    Args:
        n_embd, vocab_size, n_layer, num_blks, emb_dropout, blk_dropout:
            Accepted but not used yet.
        image_embed_dim: Channel width of the patch embeddings; must be
            divisible by ``num_heads``.
        img_size: Height/width of the input images.
        patch_size: Side length of each square patch.
        num_heads: Number of heads the embedding width is divided between.
    """

    def __init__(self, n_embd, image_embed_dim, vocab_size,
                 n_layer, img_size, patch_size, num_heads,
                 num_blks, emb_dropout, blk_dropout):
        super().__init__()
        num_hiddens = image_embed_dim
        assert num_hiddens % num_heads == 0

        # Use the constructor arguments rather than fixed 96/16/512 so the
        # encoder accepts the image size the caller asked for.
        self.vision_encoder = PatchEmbedings(img_size, patch_size, num_hiddens)
        # Head needs an explicit head_size; the divisibility check above makes
        # num_hiddens // num_heads exact.
        # NOTE(review): dropout stays at the original 0.1 — unclear whether
        # emb_dropout or blk_dropout is meant to apply here; confirm.
        self.decoder = Head(0.1, True, num_hiddens, num_hiddens // num_heads)

<h2> Helper functions </h2>

In [None]:
def image_embedding(image,patch_size):
    """Cut a 4-D image tensor into flattened, non-overlapping square patches.

    Dimensions 2 and 3 of ``image`` are tiled with windows of side
    ``patch_size`` (presumably the layout is batch, channels, height, width —
    confirm with callers). Trailing rows/columns that do not fill a whole
    window are dropped. Shapes are printed at each step for debugging.

    Args:
        image: 4-D tensor to split.
        patch_size: Window size and step along dimensions 2 and 3.

    Returns:
        Tensor of shape (image.shape[0], number of patches, values per patch).
    """
    print(f"Before unfold {image.shape}")
    patches = image.unfold(2,size = patch_size, step = patch_size).unfold(3,size = patch_size, step = patch_size)
    # Dimension 2 gives the row count of the patch grid, dimension 3 the columns.
    num_patches_h = image.shape[2]//patch_size
    num_patches_w = image.shape[3]//patch_size
    num_patches = num_patches_h * num_patches_w
    print(f"After unfold {patches.shape}")
    # Move the two grid axes in front of the channel axis; .contiguous() is
    # required because .view cannot operate on the permuted memory layout.
    patches = patches.permute(0,2,3,1,4,5).contiguous()
    print(patches.shape)
    patches = patches.view(image.shape[0], num_patches, -1)
    print(f"Patches shape: {patches.shape}")
    # Previously the result was computed and then discarded.
    return patches


<h2> Main code </h2>

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Image/caption dataset that augments images and embeds tokenized captions.

  Each item is a 3-tuple: the transformed image, the embedded caption token
  ids, and the embedded attention mask.

  Args:
    features: Indexable collection of images accepted by the torchvision
      transforms (presumably PIL images — confirm with the caller).
    labels: Indexable collection of caption strings; its length defines the
      dataset length.
  """

  def __init__(self,features,labels):
    #TRAINING EXAMPLES
    self.features = features
    self.labels = labels

    #TRANSFORMS: resize to 224x224, light augmentation, then scale to tensors
    #normalized with mean 0.5 / std 0.5 per channel.
    self.train_transforms = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.RandomHorizontalFlip(p = 0.4),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(mean = [0.5,0.5,0.5], std = [0.5, 0.5 ,0.5])
    ])

    #TOKENIZATION: BERT tokenizer plus a freshly initialized 768-wide
    #embedding table sized to the tokenizer vocabulary.
    self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    self.embedding = nn.Embedding(self.tokenizer.vocab_size,768)

  def __len__(self):
    """Return the number of captions."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return (transformed image, embedded caption tokens, embedded mask) for ``idx``."""
    features_iter = self.features[idx]
    labels_iter = self.labels[idx]

    # Captions are padded or truncated to exactly 14 tokens.
    label_token = self.tokenizer(labels_iter,padding = "max_length", max_length=14, truncation = True, return_tensors = "pt")
    feature_transform = self.train_transforms(features_iter)

    label_tokens = self.embedding(label_token['input_ids'])
    label_tokens_final = label_tokens.unsqueeze(0)

    # NOTE(review): the 0/1 attention mask is looked up in the token
    # embedding table, i.e. treated as token ids 0 and 1. That is probably
    # unintended; left unchanged so the returned shape stays the same.
    attention_mask = label_token['attention_mask']
    attention_mask = self.embedding(attention_mask).unsqueeze(0)

    # A second, unreachable two-value return used to follow this one.
    return feature_transform,label_tokens_final, attention_mask


def model_training(dataloader, head):
  """Run ``head`` over every batch of ``dataloader`` and print the shapes.

  Args:
    dataloader: Iterable yielding 3-tuples of tensors (x, y, z).
    head: Callable applied to ``x``; its result must expose ``.shape``.

  Returns:
    The (x, y, z) tensors of the last batch, or (None, None, None) when the
    dataloader yields nothing, so callers can always unpack three values.
  """
  x = y = z = None
  for (x,y,z) in tqdm(dataloader, desc = "TRAINING"):
    x = x.to("cpu")
    # Move y itself; the original assigned x here and lost the labels.
    y = y.to("cpu")
    z = z.to("cpu")
    print(f"X shape {x.shape}")
    prediction = head(x)
    print(prediction.shape)

  return x, y, z

def tokenizer_trials(tokens):
  """Print the length of the longest element found under any entry's 'tokens' key.

  Prints 0 when there are no elements at all.
  """
  longest = max(
    (len(sentence) for entry in tokens for sentence in entry['tokens']),
    default = 0,
  )
  print(longest)








In [None]:
from google.colab import drive
import os

# Log in to the Hugging Face Hub, then load the dataset.
# NOTE(review): the token is an empty string here — presumably redacted;
# supply a real token before running.
login(token="")
# Load the "train" split of xcpan/coco2017 and print its summary.
dataset = load_dataset("xcpan/coco2017", split ="train")
print(dataset)

Dataset({
    features: ['caption', 'image'],
    num_rows: 102512
})


In [None]:
from datasets import load_from_disk
from PIL import Image


# Shuffle with a fixed seed and keep the first 500 rows as a small subset.
small_dataset = dataset.shuffle(seed=42).select(range(500))
# Patch embedder built with its default constructor arguments.
patch = PatchEmbedings()
# TODO: use this to train with a smaller version of the dataset
  #dataset = load_dataset("HuggingFaceM4/COCO", split = "train")

# Pull the image and caption columns out of the subset.
images = small_dataset['image']
caption = small_dataset['caption']
#images, conversations = data_checking(images, conversations)

#new_tokens = [token['tokens'] for token in tokens]
# Dataset is the class defined in this notebook; it shadows the
# torch.utils.data.Dataset import of the same name.
dataset_class = Dataset(images, caption)
training_loader = DataLoader(dataset_class, batch_size = 32, shuffle=True)

# NOTE(review): this unpacking only succeeds if model_training returns three
# values; a None return raises "cannot unpack non-iterable NoneType object".
feature_transform, final_tokens, attention_mask = model_training(training_loader, patch)

TRAINING:   6%|▋         | 1/16 [00:00<00:05,  2.89it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  12%|█▎        | 2/16 [00:00<00:04,  2.90it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  19%|█▉        | 3/16 [00:01<00:04,  3.00it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  25%|██▌       | 4/16 [00:01<00:04,  2.88it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  31%|███▏      | 5/16 [00:01<00:03,  2.94it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  38%|███▊      | 6/16 [00:02<00:03,  2.93it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])
X shape torch.Size([32, 3, 224, 224])


TRAINING:  44%|████▍     | 7/16 [00:02<00:03,  2.56it/s]

torch.Size([32, 196, 768])
X shape torch.Size([32, 3, 224, 224])


TRAINING:  50%|█████     | 8/16 [00:03<00:03,  2.33it/s]

torch.Size([32, 196, 768])
X shape torch.Size([32, 3, 224, 224])


TRAINING:  56%|█████▋    | 9/16 [00:03<00:03,  2.22it/s]

torch.Size([32, 196, 768])
X shape torch.Size([32, 3, 224, 224])


TRAINING:  62%|██████▎   | 10/16 [00:04<00:02,  2.14it/s]

torch.Size([32, 196, 768])
X shape torch.Size([32, 3, 224, 224])


TRAINING:  69%|██████▉   | 11/16 [00:04<00:02,  2.10it/s]

torch.Size([32, 196, 768])


TRAINING:  75%|███████▌  | 12/16 [00:04<00:01,  2.15it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  81%|████████▏ | 13/16 [00:05<00:01,  2.37it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  88%|████████▊ | 14/16 [00:05<00:00,  2.57it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING:  94%|█████████▍| 15/16 [00:05<00:00,  2.73it/s]

X shape torch.Size([32, 3, 224, 224])
torch.Size([32, 196, 768])


TRAINING: 100%|██████████| 16/16 [00:06<00:00,  2.59it/s]

X shape torch.Size([20, 3, 224, 224])
torch.Size([20, 196, 768])





TypeError: cannot unpack non-iterable NoneType object