In [1]:
! pip install pyarrow matplotlib sentencepiece pandas opencv-python
import torch
import os
import cv2
import sys
import numpy as np
import math
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
    torch.cuda.empty_cache()
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Using device: cuda


Dataset Import

In [2]:
# Read data from both Parquet files
# train_0 = pd.read_parquet('../dataset/0000.parquet')
# train_1 = pd.read_parquet('../dataset/0001.parquet')
# train_2 = pd.read_parquet('../dataset/0002.parquet')
# train_3 = pd.read_parquet('../dataset/0003.parquet')
# train_4 = pd.read_parquet('../dataset/0004.parquet')
# train_5 = pd.read_parquet('../dataset/0005.parquet')
# train_6 = pd.read_parquet('../dataset/0006.parquet')
# train_7 = pd.read_parquet('../dataset/0007.parquet')
# train_8 = pd.read_parquet('../dataset/0008.parquet')

# train = pd.concat([train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8,], ignore_index=True)
# train = train.reset_index(drop=True)

# print(train.shape)
# print(train[:2])
# train = train.iloc[:100]

In [3]:
sp = spm.SentencePieceProcessor()
sp.Load('../dataset/spm_10000_vocab_model.model')

True

Dataset

In [4]:
class CaptionDataset(Dataset):
    def __init__(self, data_file_path, image_side, patch_side, sp,  device, validate = False):
        super().__init__()
        self.sp = sp
        if validate == True:
            self.data = pd.read_parquet(data_file_path) [:100]
        else:
            self.data = pd.read_parquet(data_file_path)
        self.dataset = []
        self.device = device

        for row in self.data.itertuples():
            image = self.tranformImageForPatching(row.image, image_side, patch_side)
            patches = self.patchImage(image, image_side, patch_side)
            imageTokens = self.patchesToTokens(patches)

            for caption in row.caption:
                self.dataset.append([
                    row.image,  
                    imageTokens, 
                    caption,
                ])
    
    def tranformImageForPatching(self, image, image_side, patch_side):
        image_pxl = cv2.imdecode(np.frombuffer(image['bytes'], np.uint8), cv2.IMREAD_COLOR)
        image_pxl = cv2.cvtColor(image_pxl, cv2.COLOR_BGR2RGB)
        cropped_image = self.cropImage(image_pxl)
        # padded_image = self.addPadding(image_pxl, patch_side)

        resized_image = cv2.resize(cropped_image,  (image_side, image_side), interpolation=cv2.INTER_AREA)        

        return resized_image
    
    def addPadding(self, image, patch_side):
        ## For flaxible tokens amount
        # padded_height = ((image.shape[0] // patch_side) + 1) * patch_side
        # padded_width = ((image.shape[1] // patch_side) + 1) * patch_side
        # padded_image = cv2.copyMakeBorder(image, 0, padded_height - image.shape[0], 0, padded_width - image.shape[1], cv2.BORDER_REFLECT)
        
        # Get dimensions of the image
        height, width = image.shape[:2]
        pad_size = max(width, height)
        delta_w = pad_size - width
        delta_h = pad_size - height
        top, bottom = delta_h // 2, delta_h - (delta_h // 2)
        left, right = delta_w // 2, delta_w - (delta_w // 2)

        # Pad the image with black color (0, 0, 0) BORDER_CONSTANT
        image_padded = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

        return image_padded
    
    def cropImage(self, image):
          # Get the dimensions of the image
        height, width = image.shape[:2]
        side_length = min(height, width)

        # Calculate coordinates to crop the central square
        top = (height - side_length) // 2
        left = (width - side_length) // 2
        bottom = top + side_length
        right = left + side_length

        # Crop the central square
        cropped_image = image[top:bottom, left:right]
        return cropped_image
    
    def patchImage(self, image, image_side, patch_side):
        patches = []
        for y in range(0, image_side, patch_side):
            for x in range(0, image_side, patch_side):
                # Extract the block
                block = image[y:y+patch_side, x:x+patch_side]
                patches.append(block)
 
        return patches
    
    def patchesToTokens(self, patches):
        tokens = []
        for patch in patches:
            patch_tensor = torch.tensor(patch, device = self.device).permute(2, 0, 1).float()  # CHW format
            normalized_patch = patch_tensor / 255.0
            normalized_patch = normalized_patch.reshape(-1)
            tokens.append(normalized_patch)
        return torch.stack(tokens, dim=0)
    
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image = self.dataset[idx][0]
        imageTokens = self.dataset[idx][1]
        caption_input  = torch.tensor([sp.PieceToId('<s>')] + self.sp.EncodeAsIds(self.dataset[idx][2]), dtype=torch.long, device = self.device)
        caption_target = torch.tensor(self.sp.EncodeAsIds(self.dataset[idx][2]) + [sp.PieceToId('</s>')], dtype=torch.long, device = self.device)
        return image, imageTokens, caption_input, caption_target

Padding Function

In [5]:
def collate_fn(batch):
    image, imagesTokens, captionsInputs, captionsTargets = zip(*batch)  # Unzip the batch into inputs and targets
    inputs_padded = pad_sequence(captionsInputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(captionsTargets, batch_first=True, padding_value=0)
    return image, torch.stack(imagesTokens, dim=0), inputs_padded, targets_padded

Model

In [6]:
class Head(nn.Module):
    def __init__(self, embed_size, head_size, dropout, device):
        super().__init__()
        self.head_size      = head_size
        self.embed_size     = embed_size
        self.device         = device
        
        self.Key   = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size]
        self.Query = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size] 
        self.Value = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size] 
        
        self.Dropout = nn.Dropout(dropout)
        
    def forward(self, q, k , v, dont_look_ahead = True, padding_mask = None):
        batchSize, tokens, head_size = q.shape
        query = self.Query(q)  # Size: [batchSize x tokens x head_size]
        key   = self.Key(k)    # Size: [batchSize x tokens x head_size]
        value = self.Value(v)  # Size: [batchSize x tokens x head_size]

        tril = torch.tril(torch.ones(tokens, tokens, device = self.device))               # Size: [tokens_amount x tokens_amount]. Diagonale ones left side only.                                                                      

        # Compute Attention scores ("Affinities")
        attention = query @ key.transpose(-2, -1) * head_size**0.5                        # [Batch Size x Tokens amount x head_size] @ [Batch Size x head_size x Tokens amount] --> [Batch Size x Tokens amount x Tokens amount]

        if padding_mask is not None:
            attention = attention.masked_fill(padding_mask == 0, float(-1e9))           # Size: [batchSize x tokens x tokens]
        if dont_look_ahead == True :
            attention = attention.masked_fill(tril[:tokens, :tokens] == 0, float(-1e9)) # Size: [batchSize x tokens x tokens]
        
        attention = F.softmax(attention, dim=-1)                                          # Size: [batchSize x tokens x tokens]
        attention = self.Dropout(attention)
        
        out = attention @ value                                                           # Size: [Batch Size x Tokens Amount x head_size]
        return out                                                                        # Size: [Batch Size x Tokens Amount x head_size]
    
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads, head_size, dropout, device):
        super().__init__()  
        self.num_heads  = num_heads
        self.head_size  = head_size
        self.embed_size = embed_size 
        self.device     = device
        
        self.Heads = nn.ModuleList()
        for _ in range(num_heads):
            self.Heads.append(Head(self.embed_size, self.head_size, dropout, self.device)) # ModuleList Size: [num_heads]

        self.Projection = nn.Linear(self.embed_size, self.embed_size)    # Size: [Batch Size x Tokens Amount x embed_size]
        self.Dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, dont_look_ahead = True, mask=None):
        multiHead = torch.cat([head(q, k, v, dont_look_ahead, mask) for head in self.Heads], dim=-1)  # Size: [Batch Size x Tokens Amount x embed_size]
        projection = self.Dropout(self.Projection(multiHead))            # Size: [Batch Size x Tokens Amount x embed_size]
        return projection                                                # Size: [Batch Size x Tokens Amount x embed_size]

class FeedForward(nn.Module):
    def __init__(self, embed_size, dropout):
        super().__init__()
        self.FeedForward = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),  # Size: [Batch Size x Tokens Amount x embed_size]
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),  # Size: [Batch Size x Tokens Amount x embed_size]
            nn.Dropout(dropout)
        )
    
    def forward(self, attentions):
        return self.FeedForward(attentions)
    
class DecoderBlock(nn.Module):
    def __init__(self,  embed_size, num_heads, head_size, dropout, device):
        super().__init__()
        self.embed_size = embed_size
        self.num_heads  = num_heads
        self.head_size  = head_size
        self.device     = device

        self.MultiAttentionHeads = MultiHeadAttention(self.embed_size, self.num_heads, self.head_size, dropout, self.device) # Size: [Batch Size x Tokens Amount x head_size]
        self.FeedForward         = FeedForward(self.embed_size, dropout)   # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln1                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln2                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln3                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]

    def forward(self, captionPositionedEmbeddings, encoderK, encoderV, crossMask=None, mask=None):

        captionQ = captionK = captionV = self.Ln1(captionPositionedEmbeddings)
        captionAttentions  = captionPositionedEmbeddings + self.MultiAttentionHeads(captionQ, captionK, captionV, dont_look_ahead = True, mask=mask) # Size: [Batch Size x Tokens Amount x embed_size]. Apply MultiHead Attention Layer

        decoderQ = self.Ln2(captionAttentions)
        encoderK = self.Ln2(encoderK) 
        encoderV = self.Ln2(encoderV)

        mergedAttentions  = decoderQ + self.MultiAttentionHeads(decoderQ, encoderK, encoderV, dont_look_ahead = False, mask=crossMask)
        feedForward = mergedAttentions + self.FeedForward(self.Ln3(mergedAttentions))       # Size: [Batch Size x Tokens Amount x embed_size]
        return feedForward                                                                  # Size: [Batch Size x Tokens Amount x embed_size]

class EncoderBlock(nn.Module):
    def __init__(self,  embed_size, num_heads, head_size, dropout, device):
        super().__init__()
        self.embed_size = embed_size
        self.num_heads  = num_heads
        self.head_size  = head_size
        self.device     = device

        self.MultiAttentionHeads = MultiHeadAttention(self.embed_size, self.num_heads, self.head_size, dropout, self.device) # Size: [Batch Size x Tokens Amount x head_size]
        self.FeedForward         = FeedForward(self.embed_size, dropout)   # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln1                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln2                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]

    def forward(self, positionedEmbeddings, mask=None):
        imageQ = imageK = imageV  = self.Ln1(positionedEmbeddings)
        attentions  = positionedEmbeddings + self.MultiAttentionHeads(imageQ, imageK, imageV, dont_look_ahead = False, mask=None) # Size: [Batch Size x Tokens Amount x embed_size]. Apply MultiHead Attention Layer
        feedForward = attentions + self.FeedForward(self.Ln2(attentions))                                                         # Size: [Batch Size x Tokens Amount x embed_size]
        return feedForward           

class Encoder(nn.Module):
    def __init__(self, num_blocks, num_heads, embed_size, head_size, image_side, patch_side, dropout, device):
        super().__init__()
        self.device                 = device
        self.embed_size             = embed_size
        self.num_blocks             = num_blocks
        self.num_heads              = num_heads
        self.head_size              = head_size
        self.image_side             = image_side
        self.patch_side             = patch_side
        
        pathcesEmbeddingInputSize = patch_side**2 * 3
        self.pathcesEmbedding = nn.Linear(pathcesEmbeddingInputSize, embed_size)
        self.EncoderBlocks = nn.ModuleList([
            EncoderBlock(self.embed_size, self.num_heads, self.head_size, dropout, self.device) for _ in range(self.num_blocks)
        ])
        self.final_layer_norm = nn.LayerNorm(self.embed_size)
    
    def positionEncoding(self, input_tokens_amount):
        positionEncoding = torch.zeros(input_tokens_amount, self.embed_size, device = self.device)                                       # max length x embedding dimmensions equivalent to Size: [input_tokens_amount x embed_size]
        positions = torch.arange(0, input_tokens_amount, dtype=torch.float, device = self.device).unsqueeze(1)                           # Tensor [0, 1, 2,..., input_tokens_amount] -> [⋮] : rotated for each value in separate row of 1 column
        div_term = torch.exp(torch.arange(0, self.embed_size, 2, device = self.device).float() * (-math.log(10000.0) / self.embed_size)) # Tensor [0, 2, 4,..., embed_size] x (-math.log(10000.0) / self.embed_size) --> exponenta

        positionEncoding[:, 0::2] = torch.sin(positions * div_term)        # Size: [input_tokens_amount x embed_size], set the odd values (columns 1 and 3) 
        positionEncoding[:, 1::2] = torch.cos(positions * div_term)        # Size: [input_tokens_amount x embed_size], set the even values (columns 2 and 4) 
 
        return positionEncoding.unsqueeze(0)                               # Size: [1 (for batch dim) x input_tokens_amount x embed_size]
     
    def forward(self, imagesTokens):
        # imagesTokens   Size: [Batch Size x Image Tokens Amount x patch_side**2 * 3]       
        
        batchSize, imageTokensAmount, _ = imagesTokens.shape
        imageEmbedding = self.pathcesEmbedding(imagesTokens)                                   # Size [Batch Size  x Image Tokens Amount x embed_size]
        imagePositionedEmbeddings = imageEmbedding + self.positionEncoding(imageTokensAmount)  # Size [Batch Size  x Image Tokens Amount x embed_size]

        imageBlocks = imagePositionedEmbeddings
        for block in self.EncoderBlocks:
            imageBlocks = block(imageBlocks, mask = None)  # Size: [Batch Size x Image Tokens Amount x embed_size]
        encoderOut = self.final_layer_norm(imageBlocks)    # Size: [Batch Size x Image Tokens Amount x embed_size]

        return encoderOut                                  # Size: [Batch Size x Image Tokens Amount x embed_size]

class Decoder(nn.Module):
    def __init__(self, num_blocks, num_heads, embed_size, head_size,  vocab_size, dropout, device):
        super().__init__()
        self.device                 = device
        self.num_blocks             = num_blocks
        self.embed_size             = embed_size
        self.vocab_size             = vocab_size
        self.num_heads              = num_heads
        self.head_size              = head_size

        self.captionEmbedding = torch.nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embed_size, device = self.device) # Size: [vocab_size x embed_size]
        self.DecoderBlocks = nn.ModuleList([
            DecoderBlock(self.embed_size, self.num_heads, self.head_size, dropout, self.device) for _ in range(self.num_blocks)
        ])
        self.decoder_final_layer_norm = nn.LayerNorm(self.embed_size)
        self.LangModelHead  = nn.Linear(self.embed_size, self.vocab_size, device = self.device) # Size: [embed_size x vocab_size]
    
    def positionEncoding(self, input_tokens_amount):
        positionEncoding = torch.zeros(input_tokens_amount, self.embed_size, device = self.device)                                  # max length x embedding dimmensions equivalent to Size: [input_tokens_amount x embed_size]
        positions = torch.arange(0, input_tokens_amount, dtype=torch.float, device = self.device).unsqueeze(1)                      # Tensor [0, 1, 2,..., input_tokens_amount] -> [⋮] : rotated for each value in separate row of 1 column
        div_term = torch.exp(torch.arange(0, self.embed_size, 2, device = self.device).float() * (-math.log(10000.0) / self.embed_size)) # Tensor [0, 2, 4,..., embed_size] x (-math.log(10000.0) / self.embed_size) --> exponenta

        positionEncoding[:, 0::2] = torch.sin(positions * div_term)        # Size: [input_tokens_amount x embed_size], set the odd values (columns 1 and 3) 
        positionEncoding[:, 1::2] = torch.cos(positions * div_term)        # Size: [input_tokens_amount x embed_size], set the even values (columns 2 and 4) 
 
        return positionEncoding.unsqueeze(0)                               # Size: [1 (for batch dim) x input_tokens_amount x embed_size]
     
    def forward(self, captionsInputs, encoderK, encoderV):
        # captionsInputs Size: [Batch Size x Tokens Amount]
    
        batchSize, captionTokensAmount = captionsInputs.shape

        caption_padding_lables = (captionsInputs != 0).float().to(device)
        caption_padding_mask = caption_padding_lables.unsqueeze(-1) @ caption_padding_lables.unsqueeze(-2)

        batchSize, patchesAmount, _ = encoderK.shape
        
        image_padding_label = torch.ones(batchSize, patchesAmount, dtype=torch.float).to(device)

        cross_padding_mask = caption_padding_lables.unsqueeze(-1) @ image_padding_label.unsqueeze(-2)

        captionEmbeddings = self.captionEmbedding(captionsInputs)                              # Size: [Batch Size x Tokens Amount x embed_size]
        positionedEmbeddings = captionEmbeddings + self.positionEncoding(captionTokensAmount)  # Size: [Batch Size x Tokens Amount x embed_size]
        
        decoderInput = positionedEmbeddings
        for block in self.DecoderBlocks:
            decoderInput = block(decoderInput, encoderK, encoderV, crossMask = cross_padding_mask, mask = caption_padding_mask)        # Size: [Batch Size x Tokens Amount x embed_size]
        decoderOut = self.decoder_final_layer_norm(decoderInput)
        
        logits = self.LangModelHead(decoderOut)                            # Size: [Batch Size x Tokens Amount x vocab_size]
        return logits                                                      # Size: [Batch Size x Tokens Amount x vocab_size]

class captionTransformer(nn.Module):
    def __init__(self, num_blocks, num_heads, embed_size, head_size, vocab_size, image_side, patch_side, dropout, device):
        super().__init__()
        self.device                 = device
        self.num_blocks             = num_blocks
        self.embed_size             = embed_size
        self.vocab_size             = vocab_size
        self.num_heads              = num_heads
        self.head_size              = head_size
        self.image_side             = image_side
        self.patch_side             = patch_side
        
        self.Encoder = Encoder(num_blocks, num_heads, embed_size, head_size, image_side, patch_side, dropout, device)
        self.Decoder = Decoder(num_blocks, num_heads, embed_size, head_size, vocab_size, dropout, device)
      
    def forward(self, imagesTokens, captionsInputs):
        # captionsInputs Size: [Batch Size x Tokens Amount]
        # imagesTokens   Size: [Batch Size x Image Tokens Amount x patch_side**2 * 3]       
        
        # print("Encoder:")
        encoderK = encoderV = self.Encoder(imagesTokens)            # Size: [Batch Size x Image Tokens Amount x embed_size]
        # print("Decoder:")
        logits = self.Decoder(captionsInputs, encoderK, encoderV)  # Size: [Batch Size x Caption Tokens Amount x vocab_size]
        return logits                                               # Size: [Batch Size x Caption Tokens Amount x vocab_size]

In [7]:
# Embedd = nn.Embedding(10, 2)
# input_indices = torch.tensor([
#     [1, 1, 1, 1],
#     [1, 1, 1, 0],  # Using '0' as padding
#     [1, 0, 0, 0]   # More padding examples
# ])

# print("Input:\n",input_indices.shape)
# print(input_indices, "\n")

# padding_lable = (input_indices != 0).int()
# print("Padding mask:\n", padding_lable.shape)
# print(padding_lable, "\n")

# padding_mask_A = padding_lable.unsqueeze(-1)
# print("unsqueeze(-1) Padding mask:\n", padding_mask_A.shape)
# padding_mask_B = padding_lable.unsqueeze(-2)
# print("unsqueeze(-2) Padding mask:\n", padding_mask_B.shape)

# padding_mask = padding_mask_A @ padding_mask_B

# print("Dot product Padding mask:\n", padding_mask.shape)
# print(padding_mask, "\n")

# embedded = Embedd(input_indices)
# print("embedded input:\n", embedded.shape)
# # print(embedded, "\n")

# attention = embedded @ embedded.transpose(-2, -1)
# print("embedded.transpose(-2, -1) :\n",embedded.transpose(-2, -1).shape)
# print("attention :\n",attention.shape)
# print(attention)

# attention = attention.masked_fill(padding_mask == 0, float('-inf')) 
# print("padded attention :\n", attention.shape)
# print(attention)
# soft  = F.softmax(attention, dim=-1) 
# print("padded attention soft :\n", soft.shape)
# print(soft)

In [8]:
# print(torch.tril(torch.ones(2, 2)))

# p = torch.tensor([1,1,0,0]).float()
# p1 = p.unsqueeze(-1)
# p2 = p.unsqueeze(-2)
# print(p)
# print(p1)
# print(p2)
# mask = p1 @ p2
# print(mask)

Parameters

In [9]:

batch_size = 32
learning_rate = 3e-5
dropout = 0.2
maxNewTokens = 100
vocab_size = sp.GetPieceSize()
embed_size = 512 # 512
num_blocks = 6
num_heads = 8 # 8
head_size = int(embed_size / num_heads)
print("head_size: ", head_size)
if embed_size % num_heads != 0:
    print("embed_size Cannot be divided evenly by num_heads.")
    sys.exit()
image_side = 256
patch_side = 16
if image_side % patch_side != 0:
    print("image_side Cannot be divided evenly by patch_side.")
    sys.exit()

data = [
    # '../dataset/0000.parquet',
    # '../dataset/0001.parquet',
    # '../dataset/0002.parquet',
    '../dataset/0003.parquet',
    '../dataset/0004.parquet',
    # '../dataset/0005.parquet',
    # '../dataset/0006.parquet',
    # '../dataset/0007.parquet',
]

validation_data = [
    '../dataset/0001.parquet',
]

head_size:  64


Model and Dataloader

In [10]:
model = captionTransformer(
    embed_size          = embed_size, 
    num_blocks          = num_blocks,
    num_heads           = num_heads,
    head_size           = head_size, 
    vocab_size          = vocab_size,
    image_side          = image_side,
    patch_side          = patch_side,
    dropout             = dropout,
    device              = device,
)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
import datetime
datasets = []
for file in data:
    print("File started : ", file, datetime.datetime.now())
    captionDataset = CaptionDataset(file, image_side, patch_side, sp, device)
    datasets.append(captionDataset)
    print("Preprocessing finished: ", datetime.datetime.now())

File started :  ../dataset/0003.parquet 2024-04-12 14:00:57.012237
Preprocessing finished:  2024-04-12 14:03:15.393540
File started :  ../dataset/0004.parquet 2024-04-12 14:03:15.393784
Preprocessing finished:  2024-04-12 14:05:26.083548


In [None]:


epochs = 15
print("Training started at:", datetime.datetime.now())

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for ds in datasets:
        print("Dataloader started : ", datetime.datetime.now())
        captionDataloader = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn,)
        print("Dataloader finished: ", datetime.datetime.now())
        for batch_idx, (image, imagesTokens, captionsInputs, captionsTargets) in enumerate(captionDataloader):
            logits  = model(imagesTokens, captionsInputs)                     # Size: [Batch Size x Caption Tokens Amount x vocab_size]
            
            batchSize, captionTokensAmount, vocabSize = logits.shape
            
            logits  = logits.view(batchSize * captionTokensAmount, vocabSize) # Size: [(Batch Size * captionTokensAmount) x Vocab Size]
            targets = captionsTargets.view(batchSize * captionTokensAmount)   # Size: [(Batch Size * captionTokensAmount)]
            
            mask = targets != 0  # Assuming -1 is used for padding in labels
            loss = F.cross_entropy(logits, targets, reduction='none')
            loss = loss * mask.view(batchSize * captionTokensAmount).float()
            loss = loss.sum() / mask.sum()
            running_loss += loss.item()
            
            # loss = F.cross_entropy(logits, targets)
            optimizer.zero_grad(set_to_none = True)
            # optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (batch_idx+1) % 10 == 0:  # Print loss every 100 batches
                print(f'[{epoch + 1}, {batch_idx + 1}] loss: {running_loss / 10:.3f}')
                running_loss = 0.0
                # print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')
    
    # Validation loop
    model.eval()  # Set the model to evaluation mode
    for file in validation_data:
        print("Eval started : ", file, datetime.datetime.now())
        valid_captionDataset = CaptionDataset(file, image_side, patch_side, sp, device, True)
        valid_captionDataloader = DataLoader(valid_captionDataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn,)
        print("Eval preprocess finished : ", datetime.datetime.now())
        val_running_loss = 0.0
        with torch.no_grad():  # No gradient is needed for validation
            for batch_idx, (image, imagesTokens, captionsInputs, captionsTargets) in enumerate(valid_captionDataloader):
                logits  = model(imagesTokens, captionsInputs)                     # Size: [Batch Size x Caption Tokens Amount x vocab_size]
            
                batchSize, captionTokensAmount, vocabSize = logits.shape
                
                logits  = logits.view(batchSize * captionTokensAmount, vocabSize) # Size: [(Batch Size * captionTokensAmount) x Vocab Size]
                targets = captionsTargets.view(batchSize * captionTokensAmount)   # Size: [(Batch Size * captionTokensAmount)]
                
                mask = targets != 0  # Assuming -1 is used for padding in labels
                loss = F.cross_entropy(logits, targets, reduction='none')
                loss = loss * mask.view(batchSize * captionTokensAmount).float()
                loss = loss.sum() / mask.sum()
                val_running_loss += loss.item()

        val_loss = val_running_loss / len(valid_captionDataloader)
        print(f'Epoch {epoch + 1} validation loss: {val_loss:.3f}')
print("finish")

Training started at: 2024-04-12 14:05:26.102089
Dataloader started :  2024-04-12 14:05:26.107727
Dataloader finished:  2024-04-12 14:05:26.108140
[1, 10] loss: 9.095
[1, 20] loss: 8.614
[1, 30] loss: 8.210
[1, 40] loss: 7.836
[1, 50] loss: 7.598
[1, 60] loss: 7.232
[1, 70] loss: 7.141
[1, 80] loss: 6.852
[1, 90] loss: 6.776
[1, 100] loss: 6.537
[1, 110] loss: 6.445
[1, 120] loss: 6.320


In [None]:
import random
with torch.no_grad():
    model.eval()
    
    for file in validation_data:
        valid_captionDataset = CaptionDataset(file, image_side, patch_side, sp, device, True)
        for attempt in range(5):
            random_index = random.randint(0, len(valid_captionDataset) - 1)
            image, imagesTokens, captionsInputs, captionsTargets = valid_captionDataset[random_index]
            
            nparr = np.frombuffer(image['bytes'], np.uint8)
            image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))  # Convert BGR to RGB
            plt.axis('off')  # Hide axis
            plt.show()
    
            for J in range(3):
                startCaption = "<s>"
                startTokensIds = sp.EncodeAsIds(startCaption) 
                startTokensTensor = torch.tensor(startTokensIds, dtype=torch.long, device = device).unsqueeze(0)   # [1 x int, ..., tokens_length] 
                finalCaptionTokensIds = startTokensIds
                
                for i in range(maxNewTokens):
    
                    genLogits  = model(imagesTokens.unsqueeze(0), startTokensTensor)               # Size: [Batch Size x Caption Tokens Amount x vocab_size]
                    # Let's focus only on last token in sequence
                    genLogits = genLogits[:, -1, :]                                 # Size: [Batch Size x Vocab Size]  
                    probabilities = F.softmax(genLogits, dim=-1)                    # Size: [Batch Size x Vocab Size], Probavilities of each word from vocab
                    nextIdx = torch.multinomial(probabilities, num_samples = 1)     # Size: [Batch Size x 1]
                    
                    # apply running index to the running sequence 
                    startTokensTensor = torch.cat((startTokensTensor, nextIdx), dim=1) # Size: [Batch Size x (Tokens Amount + 1)]
                    finalCaptionTokensIds.append(nextIdx.item())
                    finalStoryTokens = []
                    for tokenId in finalCaptionTokensIds:
                        finalStoryTokens.append(sp.IdToPiece(tokenId))
               
                finalStory = ''.join(finalStoryTokens).replace('▁', ' ').strip()  # Assuming '▁' is the SentencePiece underline character
                print("Caption #", J, ": ", finalStory, "\n\n")

       
                
        
        
                
                

    model.train()