# Import the Dataset
#### Uplaoded the zipped dataset into google drive and did mount drive to access the dataset from colab

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


#### Stored the extracted data in separate folder called flikr

In [2]:
import zipfile
import os

zip_ref = zipfile.ZipFile('/content/drive/MyDrive/archive2.zip', 'r') #Opens the zip file in read mode
zip_ref.extractall('/content/flikr') #Extracts the files into the /tmp folder
zip_ref.close()

In [3]:
len(os.listdir('/content/flikr/images/'))

8091

#### Importing the required libraries

In [4]:
import pandas as pd
from pathlib import Path

In [5]:
# create an empty dataframe with 'imgs' column
df = pd.DataFrame(columns=['imgs'])


# we will store the image files and captions here before putting it into dataframe
imgs, captions = [], []

In [6]:
with open("/content/flikr/captions.txt", "r") as f:
    content = f.readlines()

In [7]:
content[0]

'image_name|caption_number|caption_text\n'

In [8]:
len(content)

40456

In [9]:
root_dir = Path("/content/flikr")

#### Extracting the first caption of the image and mapping the caption to the image's location. finally storing it in the dataframe

In [10]:
for line in content:
    line = line.strip().split("|")

    # extract the required informations
    img_path = line[0]
    caption_number = line[1]
    caption = line[-1]

    # check if the caption_number is equal to 1
    if caption_number == '1':
        # store the image path
        imgs.append(root_dir/"images"/img_path)
        # store the caption
        captions.append(caption)

In [11]:
df.loc[:, 'imgs'] = imgs
df.loc[:, 'captions'] = captions

In [12]:
len(df)

8091

#### Install transformers for the usage of GPT2 transformer to initialize the weights

In [13]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


#### Tokenizer in NLP: The AutoTokenizer in NLP processes text by tokenizing it, converting it into numerical representations, and handling preprocessing steps. It prepares the text in a format suitable for direct input to the model.

#### Feature Extractor in CV: The feature extractor in computer vision processes an image to prepare it for the vision model. It transforms the image into a format appropriate for direct input, extracting relevant features that the model can utilize.

In [14]:
from transformers import AutoFeatureExtractor, AutoTokenizer

encoder_checkpoint = "google/vit-base-patch16-224-in21k"
decoder_checkpoint = "gpt2"

feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
tokenizer.pad_token = tokenizer.eos_token

#### Let’s take an example image and a caption to see applying feature_extractor and tokenizer

In [16]:
from PIL import Image

# maximum length for the captions
max_length = 128
sample = df.iloc[0]

# sample image
image = Image.open(sample['imgs']).convert('RGB')
# sample caption
caption = sample['captions']

# apply feature extractor on the sample image
inputs = feature_extractor(images=image, return_tensors='pt')
# apply tokenizer
outputs = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )

#### Now, let’s write a normal dataset loading class as we usually do in pytorch. We will pass the dataframe we just created and extract the images and captions. Then for each each image and caption we apply the feature_extractor and tokenizer.

In [17]:
from torch.utils.data import Dataset

class LoadDataset(Dataset):
    def __init__(self, df):
        self.images = df['imgs'].values
        self.captions = df['captions'].values
    
    def __getitem__(self, idx):
        # everything to return is stored inside this dict
        inputs = dict()

        # load the image and apply feature_extractor
        image_path = str(self.images[idx])
        image = Image.open(image_path).convert("RGB")
        image = feature_extractor(images=image, return_tensors='pt')

        # load the caption and apply tokenizer
        caption = self.captions[idx]
        labels = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )['input_ids'][0]
        
        # store the inputs(pixel_values) and labels(input_ids) in the dict we created
        inputs['pixel_values'] = image['pixel_values'].squeeze()   
        inputs['labels'] = labels
        return inputs
    
    def __len__(self):
        return len(self.images)


#### Let’s split our dataframe into training and testing set

In [18]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

#### Now we will load each image and caption of our dataset using the LoadDataset class

In [19]:
train_ds = LoadDataset(train_df)
test_ds = LoadDataset(test_df)

In [20]:
dir(test_ds)

['__add__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_is_protocol',
 'captions',
 'images']

In [21]:
test_ds.images

array([PosixPath('/content/flikr/images/3139895886_5a6d495b13.jpg'),
       PosixPath('/content/flikr/images/3133825703_359a0c414d.jpg'),
       PosixPath('/content/flikr/images/244910177_7c4ec3f65b.jpg'), ...,
       PosixPath('/content/flikr/images/3198247669_7493af04a7.jpg'),
       PosixPath('/content/flikr/images/253762507_9c3356c2f6.jpg'),
       PosixPath('/content/flikr/images/3547704737_57d42d5d9d.jpg')],
      dtype=object)

In [22]:
test_ds.images[10]

PosixPath('/content/flikr/images/3676561090_9828a9f6d0.jpg')

In [23]:
test_ds[10]

{'pixel_values': tensor([[[-0.2471, -0.4039, -0.2863,  ..., -0.4980, -0.5765, -0.5294],
          [-0.0980, -0.2706, -0.1686,  ..., -0.5216, -0.5059, -0.3255],
          [-0.1608, -0.2627, -0.1765,  ..., -0.4667, -0.4510, -0.2157],
          ...,
          [-0.5843, -0.6235, -0.6392,  ..., -0.4196, -0.4745, -0.5451],
          [-0.5451, -0.5608, -0.6314,  ..., -0.3961, -0.4745, -0.5765],
          [-0.4353, -0.5216, -0.5765,  ..., -0.4039, -0.4510, -0.6078]],
 
         [[-0.1608, -0.2627, -0.1529,  ..., -0.3804, -0.4667, -0.4196],
          [-0.0118, -0.1529, -0.0588,  ..., -0.4196, -0.4196, -0.2706],
          [-0.0824, -0.1608, -0.0588,  ..., -0.4039, -0.3882, -0.2000],
          ...,
          [-0.4980, -0.5608, -0.6392,  ..., -0.3961, -0.3804, -0.4431],
          [-0.5137, -0.5216, -0.6314,  ..., -0.3490, -0.4118, -0.4824],
          [-0.4431, -0.5216, -0.5922,  ..., -0.3725, -0.4196, -0.5294]],
 
         [[-0.2784, -0.4039, -0.3255,  ..., -0.5294, -0.5843, -0.5294],
          [-

In [None]:
test_ds.images[65]

PosixPath('/content/flikr/images/2314722788_6262c3aa40.jpg')

# Fine Tuning the Model (Training)
#### Now let's the pretrained model VisionEncoderDecoderModel where we can pass the name of the vision model and the language model we need for the encoder and the decoder respectively

In [24]:
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint, 
    decoder_checkpoint
)

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.10.crossattention.c_attn.weight', 'h.4.crossattention.masked_bias', 'h.3.crossattention.bias', 'h.9.ln_cross_attn.weight', 'h.10.crossattention.masked_bias', 'h.3.ln_cross_attn.weight', 'h.11.crossattention.q_attn.weight', 'h.6.crossattention.c_proj.weight', 'h.7.crossattention.c_proj.weight', 'h.0.crossattention.c_proj.bias', 'h.7.crossattention.masked_bias', 'h.1.crossattention.bias', 'h.2.crossattention.q_attn.weight', 'h.0.crossattention.q_attn.weight', 'h.7.ln_cross_attn.weight', 'h.2.ln_cross_attn.weight', 'h.0.crossattention.masked_bias', 'h.1.crossattention.masked_bias', 'h.5.crossattention.c_attn.weight', 'h.7.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.bias', 'h.1.ln_cross_attn.weight', 'h.5.crossattention.masked_bias', 'h.10.crossattention.c_proj.weight', 'h.1.crossattention.c_attn.weight', 'h.9.crossattention.bias', 'h.8.crossattention.c_attn.we

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### Set a value for decoder_start_token_id and pad_token_id in the model config

In [25]:
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

#### Inorder to get good and meaningful captions from the model, we should use beam search instead of greedy search while generating the caption. For that, we just have to set a value greater than 1 for num_beams in model config

In [26]:
# set number of beams for beam search to 4
num_beams = 4
model.config.num_beams = num_beams

In [27]:
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


#### We need to pass some arguments to control the training of our model.

In [28]:
from transformers import Seq2SeqTrainingArguments

# batch size
bs = 8

training_args = Seq2SeqTrainingArguments(
    output_dir="image-caption-generator", # name of the directory to store training outputs
    evaluation_strategy="epoch",          # evaluate after each epoch
    per_device_train_batch_size=bs,       # batch size during training
    per_device_eval_batch_size=bs,        # batch size during evaluation
    learning_rate=5e-5,
    weight_decay=0.01,                    # weight decay parameter for AdamW optimizer
    num_train_epochs=5,                   # number of epochs to train
    save_strategy='epoch',                # save checkpoints after each epoch
    report_to='none',                     # prevent reporting to wandb, mlflow...
)

#### We will use the Seq2SeqTrainer from transformers library.
#### Now let’s create the trainer and start the training.

In [29]:
from transformers import Seq2SeqTrainer, default_data_collator

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    data_collator=default_data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3353,0.255929
2,0.2352,0.247374
3,0.2124,0.247474
4,0.1843,0.252281
5,0.1654,0.258532


TrainOutput(global_step=4045, training_loss=0.22018022867304138, metrics={'train_runtime': 3690.0642, 'train_samples_per_second': 8.769, 'train_steps_per_second': 1.096, 'total_flos': 5.839811516647342e+18, 'train_loss': 0.22018022867304138, 'epoch': 5.0})

# Testing the model

In [30]:
import torch

In [31]:
model.eval()

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [34]:
import warnings
warnings.filterwarnings("ignore")

In [37]:

for i in range(51):
    inputs = test_ds[i]['pixel_values']
    with torch.no_grad():
        # uncomment the below line if feature extractor is not applied to the image already
        # inputs = feature_extractor(images=inputs, return_tensors='pt').pixel_values

        # generate caption for the image
        out = model.generate(
            inputs.unsqueeze(0).to('cuda'), # move inputs to GPU
            num_beams=num_beams, 
            )

    # convert token ids to string format
    decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

    print("Image_URL: ", test_ds.images[i])
    print("Model_Generated_Caption: ", decoded_out)
    print("Actual_Caption: ", test_ds.captions[i], end = '\n\n\n')

Image_URL:  /content/flikr/images/3139895886_5a6d495b13.jpg
Model_Generated_Caption:  A black and white dog is running through tall grass.
Actual_Caption:  A black and white dog is jumping over high yellow grass .


Image_URL:  /content/flikr/images/3133825703_359a0c414d.jpg
Model_Generated_Caption:  A boy in a red hat and a boy in a blue hat are standing in front of a
Actual_Caption:  A father and son looking at a funny looking Santa .


Image_URL:  /content/flikr/images/244910177_7c4ec3f65b.jpg
Model_Generated_Caption:  A woman in a white t-shirt is talking to a man in a black t-shirt
Actual_Caption:  Three women , two with tattoos , walking down the street


Image_URL:  /content/flikr/images/2127207912_9298824e66.jpg
Model_Generated_Caption:  A man climbs a rock while another watches.
Actual_Caption:  Three people make their way through rocky terrain .


Image_URL:  /content/flikr/images/1810651611_35aae644fb.jpg
Model_Generated_Caption:  A little girl is swinging on a swing set.
Ac

#### We have tested the 51 samples of test dataset, out of which 2 captions were generated with an incomplete sentence and 5 captions were generated with context barely matching with the actual caption.