<!-- # 1. ViT Vision Transformer
![](https://production-media.paperswithcode.com/methods/Screen_Shot_2021-01-26_at_9.43.31_PM_uI4jjMq.png)
> The Vision Transformer, or ViT, is a model for image classification that employs a Transformer-like architecture over patches of the image. An image is split into fixed-size patches, each of them are then linearly embedded, position embeddings are added, and the resulting sequence of vectors is fed to a standard Transformer encoder. In order to perform classification, the standard approach of adding an extra learnable “classification token” to the sequence is used.

# 2. GPT2
![](https://jalammar.github.io/images/gpt2/gpt-2-layers-2.png)
> GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. inputs are sequences of continuous text of a certain length and the targets are the same sequence, shifted one token (word or piece of word) to the right. The model uses internally a mask-mechanism to make sure the predictions for the token i only uses the inputs from 1 to i but not the future tokens. -->

# Import

In [65]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split

# hugginface
import datasets  # https://pypi.org/project/datasets/
import evaluate

from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
#     ViTFeatureExtractor,
    AutoTokenizer,
    GPT2Config,
    default_data_collator
)


import nltk

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available")
#     print(f"We will use the GPU: {torch.cuda.get_device_name(0)}")
    print("The GPU(s) are as follows:")
    for each in range(torch.cuda.device_count()):
        print(f"{each} : {torch.cuda.get_device_properties(each).name}")
else:
    print("No GPU available, usig the CPU instead.")
    device = torch.device("CPU")

There are 2 GPU(s) available
The GPU(s) are as follows:
0 : NVIDIA GeForce GTX 1080 Ti
1 : NVIDIA GeForce GTX 1080 Ti


# Parameters

In [3]:
# os.environ["WANDB_DISABLED"] = "true"
class config: 
    ENCODER = "google/vit-base-patch16-224"
    # ENCODER = "google/vit-base-patch16-224-in21k"
    DECODER = 'gpt2'
    TRAIN_BATCH_SIZE = 8
    VAL_BATCH_SIZE = 1
    VAL_EPOCHS = 1
    LR = 5e-5
    SEED = 42
    MAX_LEN = 128
    SUMMARY_LEN = 20
    WEIGHT_DECAY = 0.01
    MEAN = (0.485, 0.456, 0.406)
    STD = (0.229, 0.224, 0.225)
    TRAIN_PCT = 0.95
    NUM_WORKERS = mp.cpu_count() # number of logical CPU cores
    EPOCHS = 3
    IMG_SIZE = (224, 224)
    LABEL_MASK = -100
    TOP_K = 1000
    TOP_P = 0.95 
#     CAPTION = "/media/loveplay1983/data/ML/imgcap/Flickr-8k/captions.txt"
#     IMAGE_DIR = "/media/loveplay1983/data/ML/imgcap/Flickr-8k/Images"
    CAPTION = "/media/dllab/AI-PhD-Study/test-dataset/imgcap/flickr8k/captions.txt"
    IMAGE_DIR = "/media/dllab/AI-PhD-Study/test-dataset/imgcap/flickr8k/Images"

# Helper functions

In [4]:
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    bos - begin of special 
    eos - end of special 
    """
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

AutoTokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens

In [5]:
# # rouge = datasets.load_metric("rouge", trust_remote_code=True)
# rouge = evaluate.load("rouge", trust_remote_code=True)

# def compute_metrics(pred):
#     labels_ids = pred.label_ids
#     pred_ids = pred.predictions

#     # all unnecessary tokens are removed
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     labels_ids[labels_ids == -100] = tokenizer.pad_token_id
#     label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

#     rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

#     return {
#         "rouge2_precision": round(rouge_output.precision, 4),
#         "rouge2_recall": round(rouge_output.recall, 4),
#         "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
#     }

In [None]:
# rouge = datasets.load_metric("rouge", trust_remote_code=True)
rouge = evaluate.load("rouge", trust_remote_code=True)

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"]

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

evaluate.combine

# Error with `'numpy.float64' object has no attribute 'mid'` 

- evaluate vs datasets  
`Because evaluate module does not have mid`

In [None]:
# example from https://ankur3107.github.io/blogs/the-illustrated-image-captioning-using-transformers/
ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

**Stemming**:

- Stemming is a text processing technique that reduces words to their base or root form.
- For example, "running", "runs", and "ran" would all be stemmed to "run".


**Porter Stemmer**:

- The Porter stemmer is a widely used algorithm for stemming English words.
- It removes common suffixes like "-ing", "-ed", "-s", etc., aiming to capture the core meaning of the word.


`use_stemmer` Option:

- This option allows you to control whether the Porter stemmer is applied during text processing.
- Setting it to True enables stemming, while setting it to False disables it.

In [71]:
test_rouge = datasets.load_metric("rouge", trust_remote_code=True)

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    
    return preds, labels

preds = ["hello world", "what is rouge"]
labels = ["morning world", "what is rouge"]


preds,labels = postprocess_text(preds, labels)


result = test_rouge.compute(predictions=preds,
                            references=labels,
                            rouge_types=["rouge2"],
                            use_stemmer=True)["rouge2"].mid
# result = {k: round(v*100, 4) for k, v in result.items()}
# prediction_lens = [
#     np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
# ]
# result["gen_len"] = np.mean(prediction_lens)
print(result)




Score(precision=0.5, recall=0.5, fmeasure=0.5)


# Dataset

In [6]:
# Feature extractor and tokenizer
# feature_extractor = ViTFeatureExtractor.from_pretrained(config.ENCODER)
feature_extractor = ViTImageProcessor.from_pretrained(config.ENCODER)
tokenizer = AutoTokenizer.from_pretrained(config.DECODER)
tokenizer.pad_token = tokenizer.unk_token

In [7]:
# Transforms and dataframe
# img 224,224
# normalization
# converting img to tensor

transforms = transforms.Compose([
    transforms.Resize(config.IMG_SIZE), 
    transforms.ToTensor(),  # ToTensor convert the data into Tensor and constrain the data in [0,1]
    transforms.Normalize(mean=0.0, std=1.0)
])

In [8]:
df = pd.read_csv(config.CAPTION)
train_df, val_df = train_test_split(df, test_size=0.2)
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [25]:
train_df.sizeze

64728

In [26]:
val_df.size

16182

In [9]:
# Dataset class

The dataset is created following these steps: 
> - read the image using the Image function of PIL library
>- The image is transformed using the transformed defined above
>- The transformed image is passed through the feature extractor to extract the pixel values from the image
>- The captions are loaded from the dataframe
>- The captions are tokenized
>- The tokenized captions are padded to max length
>- The images and tokenized captions are returned

In [10]:
# def min_max_img(img):
#     min = np.min(img)
#     max = np.max(img)
#     return (img - min) / (max - min)

In [11]:
class ImgDataset(Dataset):
    def __init__(self, df, root_dir, tokenizer, feature_extractor, transform=None):
        self.df = df
        self.transform = transform
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = 50

    def __len__(self,):
        return len(self.df)

    def __getitem__(self, idx):
        # choosing the img and caption name along column index
        caption = self.df.caption.iloc[idx]
        image = self.df.image.iloc[idx]
        img_path = os.path.join(self.root_dir , image)
        img = Image.open(img_path).convert("RGB")

        if self.transform is not None:
            # using pytorch transform to process the image which loaded by PIL Image
            img = self.transform(img)

        # Generate image and caption embedding
        # 1. using normalized data as input to feature_extractor for fast computation
        # 2. converting the output of extracted feature back to pixel val by .pixel_values
        pixel_values = self.feature_extractor(img, return_tensors="pt").pixel_values
        
        captions = self.tokenizer(
            caption,
            padding="max_length",
            max_length=self.max_length    
        ).input_ids

        # Filter captions
        # this filtering step ensures that padding tokens within the captions are replaced 
        # with a specific value (here, -100) to prevent them from affecting the model's training.
        captions = [
            caption if caption != self.tokenizer.pad_token_id else -100 for caption in captions    
        ]

        # Combine image and caption embedding into a dict 
        encoding = {
            "pixel_values": pixel_values.squeeze(), 
            "labels": torch.tensor(captions)   
        }
        
        return encoding

In [12]:
# Train and validation dataset
train_dataset = ImgDataset(
    train_df, 
    root_dir = config.IMAGE_DIR,
    tokenizer = tokenizer,
    feature_extractor=feature_extractor,
    transform=transforms
)

val_dataset = ImgDataset(
    val_df,
    root_dir=config.IMAGE_DIR,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    transform=transforms
)

# Model Building

In [13]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(config.ENCODER, config.DECODER)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.

> The benefit of using the separation token ([SEP]) as an end-of-sequence (EOS) marker in text generation is indeed partially due to the fact that natural language sequences often don't end in long blocks of text, but rather in smaller chunks like paragraphs or sentences  


**1. Setting Special Token IDs:**

- **`# model.config.decoder_start_token_id = tokenizer.cls_token_id` (Commented Out):**
  - This line, although commented out, attempts to set the decoder start token ID in the model's configuration. It uses the tokenizer's `cls_token_id` (classification token), which might not be ideal for text generation tasks.
  - A more appropriate token for text generation is the `bos_token_id` (beginning-of-sequence) which is used later in the code (`model.config.decoder_start_token_id = tokenizer.bos_token_id`).
- **`model.config.pad_token_id = tokenizer.pad_token_id`:**
  - This line sets the pad token ID in the model's configuration, aligning it with the tokenizer's `pad_token_id`. Pad tokens are used for padding sequences to a fixed length during generation.

**2. Verifying Vocabulary Size:**

- **`# make sure vocab size is set correctly`** (Comment):
  - This comment highlights the importance of ensuring that the vocabulary size (`model.config.vocab_size`) in the model's configuration matches the actual vocabulary size of the decoder (`model.config.decoder.vocab_size`). Any mismatch could lead to errors during generation.
- **`model.config.vocab_size = model.config.decoder.vocab_size`:**
  - This line explicitly sets the model's vocabulary size (`model.config.vocab_size`) to the decoder's vocabulary size (`model.config.decoder.vocab_size`). This ensures consistency and helps prevent potential issues.

**3. Beam Search Parameters:**

- **`model.config.eos_token_id = tokenizer.sep_token_id`:**
  - This line sets the end-of-sequence (EOS) token ID in the model's configuration. It uses the tokenizer's `sep_token_id` (separation token) to mark the end of the generated sequence.
- **`model.config.decoder_start_token_id = tokenizer.bos_token_id`:**
  - This line correctly sets the decoder start token ID to the tokenizer's `bos_token_id`. This token signifies the beginning of the generated sequence.
- **`model.config.max_length = 128`:**
  - This line sets the maximum length of the generated sequence to 128 tokens. This limits the output length to avoid overly long or repetitive generations.
- **`model.config.early_stopping = True`:**
  - This line enables early stopping during beam search. The search stops after a certain number of beams are completed, potentially improving efficiency.
- **`model.config.no_repeat_ngram_size = 3`:**
  - This line sets the no-repeat n-gram size for beam search. It penalizes sequences that contain repeated n-grams (sequences of n consecutive tokens) of size 3 or less. This helps generate more diverse outputs.
- **`model.config.length_penalty = 2.0`:**
  - This line sets the length penalty for beam search. It favors shorter sequences by applying a penalty proportional to the sequence length. This discourages overly long outputs.
- **`model.config.num_beams = 4`:**
  - This line sets the number of beams to use in beam search. The model will explore and expand the 4 most promising partial sequences at each step. This allows for a balance between exploration and exploitation during generation.

**In summary, these lines configure the model for text generation using beam search with specific parameters to control the length, diversity, and quality of the generated outputs.**

**Additional Notes:**

- The initial attempt to set `decoder_start_token_id` with `cls_token_id` is likely a mistake, and `bos_token_id` is more suitable.
- Double-check that the tokenizer's special tokens (`bos_token_id`, `eos_token_id`, `pad_token_id`) align with the model's expectations.
- You might need to adjust these parameters (e.g., `max_length`, `num_beams`) based on your specific task and desired output characteristics.


In [14]:
# model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

# Training

> "Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the "
            "--report_to flag to control the integrations used for logging result (for instance --report_to none)."

In [15]:
# Training arguments

training_args = Seq2SeqTrainingArguments(
    output_dir='ViT_large_gpt2',
    per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=config.VAL_BATCH_SIZE,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=1024,  
    save_steps=2048, 
    warmup_steps=1024,  
    learning_rate = 5e-5,
    #max_steps=1500, # delete for full training
    num_train_epochs = config.EPOCHS, #TRAIN_EPOCHS
    overwrite_output_dir=True,
    save_total_limit=1,
    report_to=None   
)



In [16]:
# Training with Seq2SeqTrainer 

trainer = Seq2SeqTrainer(
    tokenizer = feature_extractor,
    model = model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = default_data_collator,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Epoch,Training Loss,Validation Loss




AttributeError: 'numpy.float64' object has no attribute 'mid'