# 1. ViT Vision Transformer
![](https://production-media.paperswithcode.com/methods/Screen_Shot_2021-01-26_at_9.43.31_PM_uI4jjMq.png)
> The Vision Transformer, or ViT, is a model for image classification that employs a Transformer-like architecture over patches of the image. An image is split into fixed-size patches, each of which is then linearly embedded; position embeddings are added, and the resulting sequence of vectors is fed to a standard Transformer encoder. In order to perform classification, the standard approach of adding an extra learnable “classification token” to the sequence is used.

# 2. GPT2
![](https://jalammar.github.io/images/gpt2/gpt-2-layers-2.png)
> GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. Inputs are sequences of continuous text of a certain length, and the targets are the same sequence shifted one token (word or piece of word) to the right. Internally, the model uses a masking mechanism to make sure the prediction for token i only uses the inputs from 1 to i and not the future tokens.

# Import

In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torchvision import transforms as T
from torch.utils.data import Dataset, DataLoader, random_split

# hugginface
import datasets  # https://pypi.org/project/datasets/
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    # ViTFeatureExtractor,
    AutoTokenizer,
    GPT2Config,
    default_data_collator
)

In [2]:
# Pick the compute device: CUDA when at least one GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f"There are {gpu_count} GPU(s) available")
    print("The GPU(s) are as follows:")
    for gpu_index in range(gpu_count):
        print(f"{gpu_index} : {torch.cuda.get_device_properties(gpu_index).name}")
else:
    print("No GPU available, using the CPU instead.")
    # Device type strings are lowercase; torch.device rejects "CPU".
    device = torch.device("cpu")

There are 2 GPU(s) available
The GPU(s) are as follows:
0 : NVIDIA GeForce GTX 1080 Ti
1 : NVIDIA GeForce GTX 1080 Ti


# Parameters

In [3]:
# Set the WANDB_DISABLED environment variable
# (presumably switches off Weights & Biases logging — confirm).
os.environ["WANDB_DISABLED"] = "true"


class config:
    """Notebook-wide constants, grouped by concern."""

    # --- pretrained checkpoints -------------------------------------------
    ENCODER = "google/vit-base-patch16-224"
    # ENCODER = "google/vit-base-patch16-224-in21k"
    DECODER = "gpt2"

    # --- data locations ---------------------------------------------------
    # NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
    CAPTION = "/media/dllab/AI-PhD-Study/test-dataset/imgcap/flickr8k/captions.txt"
    IMAGE_DIR = "/media/dllab/AI-PhD-Study/test-dataset/imgcap/flickr8k/Images"

    # --- image preprocessing ----------------------------------------------
    IMG_SIZE = (224, 224)
    MEAN = (0.485, 0.456, 0.406)
    STD = (0.229, 0.224, 0.225)

    # --- text lengths and label masking -----------------------------------
    MAX_LEN = 128
    SUMMARY_LEN = 20
    LABEL_MASK = -100

    # --- optimisation -----------------------------------------------------
    SEED = 42
    EPOCHS = 3
    VAL_EPOCHS = 1
    TRAIN_BATCH_SIZE = 8
    VAL_BATCH_SIZE = 1
    LR = 5e-5
    WEIGHT_DECAY = 0.01
    TRAIN_PCT = 0.95
    NUM_WORKERS = mp.cpu_count()  # number of logical CPU cores

    # --- presumably sampling settings for generation (confirm) ------------
    TOP_K = 1000
    TOP_P = 0.95

# Helper functions

In [4]:
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Wrap a token-id list in the BOS and EOS ids of ``self``.

    Args:
        self: Object exposing ``bos_token_id`` and ``eos_token_id``.
        token_ids_0: List of token ids to wrap.
        token_ids_1: Accepted but ignored; it never appears in the result.

    Returns:
        A new list ``[bos_token_id] + token_ids_0 + [eos_token_id]``.
    """
    prefix = [self.bos_token_id]
    suffix = [self.eos_token_id]
    return prefix + token_ids_0 + suffix

# Install the function above as a class attribute on AutoTokenizer.
# NOTE(review): the tokenizer used in later cells is created with
# AutoTokenizer.from_pretrained(...); confirm that the returned object actually
# inherits from AutoTokenizer, otherwise this patch never reaches it.
AutoTokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens

In [5]:
# Load the ROUGE metric object that compute_metrics calls.
# NOTE(review): the recorded cell output echoes this exact line, which is how Python
# renders the source line of a warning — presumably a deprecation warning for
# datasets.load_metric; confirm against the installed `datasets` version.
rouge = datasets.load_metric("rouge", trust_remote_code=True)

def compute_metrics(pred):
    """Score predicted captions against the reference captions with ROUGE-2.

    Relies on the module-level ``tokenizer`` (for decoding) and ``rouge``
    (the metric object).

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 marking positions
            that must be ignored).

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, taken from the ``mid`` aggregate and rounded to
        4 decimal places.
    """
    pred_ids = pred.predictions

    # -100 is not a vocabulary id, so it is swapped for the pad id before decoding.
    # np.where returns a new array, which leaves pred.label_ids unmodified.
    raw_label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(raw_label_ids == -100, tokenizer.pad_token_id, raw_label_ids)

    # batch_decode turns each row of token ids back into text; special tokens
    # (padding included) are dropped so they do not count towards the n-gram overlap.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge2"],
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = datasets.load_metric("rouge", trust_remote_code=True)


# Dataset

In [6]:
# Image processor loaded from the encoder checkpoint and tokenizer loaded from the
# decoder checkpoint (both checkpoint names come from `config`).
# Previous loader, kept for reference:
# feature_extractor = ViTFeatureExtractor.from_pretrained(config.ENCODER)
feature_extractor = ViTImageProcessor.from_pretrained(config.ENCODER)
tokenizer = AutoTokenizer.from_pretrained(config.DECODER)
# Reuse the unknown token as the padding token
# (presumably because this tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.unk_token

In [7]:
# Image preprocessing applied to each PIL image:
#   1. resize to config.IMG_SIZE
#   2. convert the image to a tensor
#   3. normalize with mean 0.5 / std 0.5
#
# The pipeline is built through the `T` alias of torchvision.transforms rather than
# the `transforms` name: this cell rebinds `transforms` to the Compose object (later
# cells pass it as `transform=transforms`), so calling `transforms.Compose` here would
# fail as soon as the cell is run a second time.
transforms = T.Compose(
    [
        T.Resize(config.IMG_SIZE),
        T.ToTensor(),
        T.Normalize(mean=.5, std=.5),
    ]
)

In [8]:
# Load the caption table: one row per (image, caption) pair.
df = pd.read_csv(config.CAPTION)

# Split on image names instead of rows. Each image appears with several captions, so
# a row-level split would put captions of the same image in both train and validation.
# random_state pins the shuffle so the split is the same on every run.
train_images, val_images = train_test_split(
    df["image"].unique(),
    test_size=0.2,
    random_state=config.SEED,
)
train_df = df[df["image"].isin(train_images)]
val_df = df[df["image"].isin(val_images)]
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [9]:
# Dataset class

The dataset is created following these steps: 
>- The image is read with the `Image` module of the PIL library
>- The image is transformed using the transform defined above
>- The transformed image is passed through the feature extractor to extract the pixel values from the image
>- The captions are loaded from the dataframe
>- The captions are tokenized
>- The tokenized captions are padded to max length
>- The images and tokenized captions are returned

In [10]:
class ImgDataset(Dataset):
    """Image-captioning dataset yielding pixel values and caption labels.

    Each row of ``df`` supplies an image file name (column ``image``) and one
    caption (column ``caption``).
    """

    def __init__(self, df, root_dir, tokenizer, feature_extractor, transform=None,
                 max_length=50, label_mask=-100):
        """
        Args:
            df: DataFrame with ``image`` and ``caption`` columns.
            root_dir: Directory that contains the image files.
            tokenizer: Callable that tokenizes a caption; its result must expose
                ``input_ids`` and ``attention_mask``.
            feature_extractor: Callable that turns an image into an object
                exposing ``pixel_values``.
            transform: Optional callable applied to the loaded image first.
            max_length: Number of tokens every caption is padded/truncated to.
            label_mask: Value written over padded label positions.
        """
        self.df = df
        self.transform = transform
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = max_length
        self.label_mask = label_mask

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for row ``idx``."""
        # Positional lookup, so the dataframe index does not need to be contiguous.
        caption = self.df.caption.iloc[idx]
        image_name = self.df.image.iloc[idx]
        img_path = os.path.join(self.root_dir, image_name)
        img = Image.open(img_path).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        # NOTE(review): the image passes through `transform` and then through
        # `feature_extractor`; if both resize/rescale/normalize, the preprocessing is
        # applied twice — confirm against the feature extractor's settings.
        pixel_values = self.feature_extractor(img, return_tensors="pt").pixel_values

        # truncation=True keeps every label vector at exactly max_length entries;
        # without it a long caption yields a longer vector than the others.
        encoded = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
        )

        # Mask padding via the attention mask rather than by comparing ids with the
        # pad id, so a real token that happens to share the pad id keeps its label.
        labels = [
            token_id if is_real else self.label_mask
            for token_id, is_real in zip(encoded.input_ids, encoded.attention_mask)
        ]

        return {
            # Drop only the leading batch dimension added by return_tensors="pt".
            "pixel_values": pixel_values.squeeze(0),
            "labels": torch.tensor(labels),
        }

In [11]:
# Train and validation datasets: both splits share every setting except the
# dataframe they read from.
shared_dataset_kwargs = dict(
    root_dir=config.IMAGE_DIR,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    transform=transforms,
)

train_dataset = ImgDataset(train_df, **shared_dataset_kwargs)
val_dataset = ImgDataset(val_df, **shared_dataset_kwargs)

# Model Building

In [12]:
# Assemble the captioning model from the encoder and decoder checkpoints named in
# `config`. The recorded output reports the decoder's cross-attention weights
# (h.*.crossattention.*, h.*.ln_cross_attn.*) as newly initialized.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(config.ENCODER, config.DECODER)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.

In [14]:
# Special-token ids. bos/eos are the same pair that build_inputs_with_special_tokens
# wraps around a sequence, so the start and stop ids are taken from them.
# NOTE(review): cls_token/sep_token are BERT-style names; confirm the decoder
# tokenizer leaves them unset, in which case reading them yields None.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.max_length = config.MAX_LEN
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

# Training

In [None]:
# Training arguments, driven by the hyper-parameters declared in `config`.
training_args = Seq2SeqTrainingArguments(
    output_dir="vit-gpt2-image-captioning",
    per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=config.VAL_BATCH_SIZE,
    learning_rate=config.LR,
    weight_decay=config.WEIGHT_DECAY,
    num_train_epochs=config.EPOCHS,
    seed=config.SEED,
    # compute_metrics decodes token ids with batch_decode, so evaluation has to
    # produce generated ids.
    predict_with_generate=True,
)