In [1]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


## Import thư viện

In [2]:
import datasets
import evaluate

import os
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback
from transformers import default_data_collator

from sklearn.model_selection import train_test_split

import pandas as pd
from PIL import Image
import nltk
import numpy as np


## Define path

In [3]:
image_dir = '/kaggle/input/uit-viic-v1-0-vietnamese-image-captioning/file'
data_dir = '/kaggle/input/uit-viic-preprocessed/Dataset_captions/Dataset_captions'
save_dir = '/kaggle/working/ViT_GPT2_Vn_model'

## Load model

In [4]:
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "NlpHUST/gpt2-vietnamese"

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( image_encoder_model, text_decode_model )
feature_extractor = AutoFeatureExtractor.from_pretrained( image_encoder_model )
tokenizer = AutoTokenizer.from_pretrained( text_decode_model )

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at NlpHUST/gpt2-vietnamese and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.wei

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/512k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

## Initialize missing weights

In [5]:
# The tie_weights() method will attempt to tie the weights of the decoder's language modeling head (lm_head) to the embedding layer. 
# This is often done in language models to share weights and improve efficiency. 
# In this case, it will effectively initialize the missing decoder.lm_head.weight by copying weights from the embedding layer.

model.tie_weights() # Copy weights from the embedding layer to the lm_head

## Define special token

In [6]:
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
# Ensure padding token is set before other tokens
tokenizer.add_special_tokens( {'pad_token': '<pad>'} )

tokenizer.pad_token_id = tokenizer.pad_token_id

# Update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id  

# Resize the embedding layer of the model to include the pad token
model.decoder.resize_token_embeddings( len(tokenizer) )

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

## Data

### Train Val

In [7]:
train_path = '/kaggle/input/uit-viic-preprocessed/Dataset_captions/Dataset_captions/train_val_captions.csv'

df_train = pd.read_csv( train_path )

df_train.head()

Unnamed: 0,image_path,caption,preprocessed_caption
0,/dataset/train/images/000000157656.jpg,Người đàn ông đang đánh tennis ngoài sân.,<start> người đàn_ông đang đánh tennis ngoài s...
1,/dataset/train/images/000000157656.jpg,Một vận động viên tennis đang vung vợt đánh bóng.,<start> một vận_động_viên tennis đang vung vợt...
2,/dataset/train/images/000000157656.jpg,Một cầu thủ tennis đang vung vợt tennis đỡ bóng.,<start> một cầu_thủ tennis đang vung vợt tenni...
3,/dataset/train/images/000000157656.jpg,Người đàn ông đang đứng ngoài biên cầm vợt sẵn...,<start> người đàn_ông đang đứng ngoài biên cầm...
4,/dataset/train/images/000000157656.jpg,Vận động viên tennis nam đang trong tư thế chu...,<start> vận_động_viên tennis nam đang trong tư...


### Train val split

In [8]:
df_train, df_val = train_test_split( df_train, test_size=0.15, random_state=42 )
print( df_train.shape )
print( df_val.shape )

(15385, 3)
(2716, 3)


In [9]:
train_path = '/kaggle/working/data/train.csv'
val_path = '/kaggle/working/data/val.csv'

if not os.path.exists( '/kaggle/working/data' ):
    os.makedirs( '/kaggle/working/data' )

df_train.to_csv( train_path, index=False )
df_val.to_csv( val_path, index=False )

In [10]:
del df_train
del df_val

### Load train

In [11]:
train_data = datasets.load_dataset( 'csv', data_files=train_path )
train_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'caption', 'preprocessed_caption'],
        num_rows: 15385
    })
})

### Load val

In [12]:
val_data = datasets.load_dataset( 'csv', data_files=val_path )
val_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'caption', 'preprocessed_caption'],
        num_rows: 2716
    })
})

## Preprocessing

In [13]:
# image preprocessing step
def feature_extraction_fn(image_paths, check_image=True):
    """
    Run feature extraction on images
    If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
    Otherwise, an exception will be thrown.
    """

    model_inputs = {}

    if check_image:
        images = []
        to_keep = []
        for image_file in image_paths:
            try:
                img = Image.open(image_file).convert('RGB')
                images.append(img)
                to_keep.append(True)
            except Exception:
                to_keep.append(False)
    else:
        images = [Image.open(image_file).convert('RGB') for image_file in image_paths]

    encoder_inputs = feature_extractor(images=images, return_tensors="np")

    return encoder_inputs.pixel_values

def preprocess_fn(examples, max_target_length, check_image = False):
    """Run tokenization + image feature extraction"""
    paths = examples['image_path']
    image_paths = [ image_dir + path for path in paths ]
    captions = examples['caption']
    
    model_inputs = {}
    # This contains image path column
    labels = tokenizer(captions, 
                      padding="max_length", 
                      max_length=max_target_length)
    
    model_inputs['labels'] = labels.input_ids
    model_inputs['attention_mask'] = labels.attention_mask
    model_inputs['pixel_values'] = feature_extraction_fn(image_paths, check_image=check_image)

    return model_inputs
# end

In [14]:
processed_train_data = train_data.map(
    function=preprocess_fn,
    batched=True,
    fn_kwargs={"max_target_length": 128},
    remove_columns=train_data['train'].column_names
)

processed_train_data

Map:   0%|          | 0/15385 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'attention_mask', 'pixel_values'],
        num_rows: 15385
    })
})

In [15]:
processed_val_data = val_data.map(
    function=preprocess_fn,
    batched=True,
    fn_kwargs={"max_target_length": 128},
    remove_columns=val_data['train'].column_names
)

processed_val_data

Map:   0%|          | 0/2716 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'attention_mask', 'pixel_values'],
        num_rows: 2716
    })
})

## Define seq2seq training arguments

In [16]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    predict_with_generate = True,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    output_dir = save_dir,
    num_train_epochs = 100,
    report_to = "none",  # Add this line to disable wandb
    load_best_model_at_end = True,  # Enable loading best model at the end
    metric_for_best_model = "eval_loss",  # Specify metric for selecting best model (e.g., "bleu")
    prediction_loss_only = False,  # Enable metric calculation
    save_total_limit = 1  # Only keep the best model checkpoint
)

# Instantiate the EarlyStoppingCallback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = 5,
                                                early_stopping_threshold = 0.01)  # Optional threshold

## Define seq2seq trainer

In [17]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = processed_train_data['train'],  # Pass the train dataset
    eval_dataset = processed_val_data['train'],   # Pass the eval dataset
    data_collator = default_data_collator,
    callbacks = [early_stopping_callback]
)

## Train

In [18]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.132604
2,0.639100,0.117647
3,0.119400,0.109191
4,0.104100,0.104497
5,0.093600,0.102856
6,0.084400,0.100383
7,0.077100,0.100538


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


TrainOutput(global_step=3367, training_loss=0.17386595134321575, metrics={'train_runtime': 16462.1964, 'train_samples_per_second': 93.457, 'train_steps_per_second': 2.922, 'total_flos': 1.9435058754182185e+19, 'train_loss': 0.17386595134321575, 'epoch': 7.0})