In [6]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ViTFeatureExtractor, ViTModel, AutoTokenizer, AutoModelForCausalLM, AdamW


In [7]:
!kaggle datasets download -d vitngquang/uit-viic-v1-0-vietnamese-image-captioning

  pid, fd = os.forkpty()


Dataset URL: https://www.kaggle.com/datasets/vitngquang/uit-viic-v1-0-vietnamese-image-captioning
License(s): other
Downloading uit-viic-v1-0-vietnamese-image-captioning.zip to /kaggle/working
100%|███████████████████████████████████████| 1.23G/1.23G [00:08<00:00, 183MB/s]
100%|███████████████████████████████████████| 1.23G/1.23G [00:08<00:00, 152MB/s]


In [8]:
!unzip -q uit-viic-v1-0-vietnamese-image-captioning.zip

In [9]:
def load_data(path):
    with open(path + 'captions.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()

    data = [line.split('\t', 1) for line in lines]
    df = pd.DataFrame(data, columns=["image_path", "caption"])

    return df

In [10]:
dataset_path = '/kaggle/working/dataset' # nếu dùng kaggle
train_path = dataset_path + '/train/'
val_path = dataset_path + '/val/'
test_path = dataset_path + '/test/'

In [11]:
df_train = load_data(train_path)
df_val = load_data(val_path)
df_test = load_data(test_path)

In [12]:
def text_preprocessing(df):
    df_preprocessed = df.copy()
    df_preprocessed['caption'] = df_preprocessed['caption'].apply(lambda x: x.lower())
    df_preprocessed['caption'] = df_preprocessed['caption'].apply(lambda x: x.replace("[^A-Za-z]",""))
    df_preprocessed['caption'] = df_preprocessed['caption'].apply(lambda x: x.replace("\s+"," "))
    df_preprocessed['caption'] = df_preprocessed['caption'].apply(lambda x: " ".join([word for word in x.split() if len(word)>1]))
    df_preprocessed['caption'] = "startseq "+df_preprocessed['caption']+" endseq"
    df_preprocessed['image_path'] = df_preprocessed['image_path'].str.replace('^/dataset', '', regex=True)
    df_preprocessed['image_path'] = dataset_path + df_preprocessed['image_path']
    return df_preprocessed

In [13]:
df_train_preprocessed = text_preprocessing(df_train)
df_val_preprocessed = text_preprocessing(df_val)
df_test_preprocessed = text_preprocessing(df_test)

In [14]:
from transformers import ViTFeatureExtractor, ViTModel,ViTImageProcessor
import torch
from PIL import Image
import numpy as np

In [15]:
# Tải mô hình và bộ trích xuất đặc trưng ViT
feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [16]:
# Danh sách ảnh (thay bằng df_train_preprocessed['image_path'] và df_val_preprocessed['image_path'])
all_images = df_train_preprocessed['image_path'].unique().tolist() + df_val_preprocessed['image_path'].unique().tolist()

# Kích thước ảnh đầu vào
img_size = 224

# Dictionary để lưu trữ đặc trưng của từng ảnh
features = {}

# Duyệt qua tất cả các ảnh
for image_path in tqdm(all_images):
    # Mở ảnh và tiền xử lý
    img = Image.open(image_path)
    
    # Nếu ảnh là grayscale, chuyển nó thành RGB
    if img.mode != 'RGB':
        img = img.convert('RGB')
    
    img = img.resize((img_size, img_size))  # Điều chỉnh kích thước ảnh
    
    # Tiền xử lý ảnh: chuẩn hóa và chuyển sang tensor
    inputs = feature_extractor(images=img, return_tensors="pt")
    
    # Trích xuất đặc trưng từ mô hình ViT
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Lấy thông tin đặc trưng từ đầu ra (trong trường hợp này là 'last_hidden_state')
    feature = outputs.last_hidden_state.squeeze().cpu().numpy()
    
    # Lưu đặc trưng vào dictionary
    features[image_path] = feature

# Lưu các đặc trưng vào tệp hoặc tiếp tục xử lý
print("Đặc trưng đã được trích xuất thành công!")

100%|██████████| 3619/3619 [13:24<00:00,  4.50it/s]

Đặc trưng đã được trích xuất thành công!





In [17]:
np.savez("features.npz", **features) # lưu features trích xuất từ các images về máy

In [18]:
tokenizer = AutoTokenizer.from_pretrained("NlpHUST/gpt2-vietnamese")
gpt2_model = AutoModelForCausalLM.from_pretrained("NlpHUST/gpt2-vietnamese")

tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/512k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [19]:
# Kiểm tra đặc trưng từ file đã lưu
features = np.load("features.npz")


In [20]:
# Chuẩn bị dữ liệu: Kết hợp embedding hình ảnh với caption
image_features = []
captions = []

for image_path, caption in zip(df_train_preprocessed['image_path'], df_train_preprocessed['caption']):
    if image_path in features:
        image_features.append(features[image_path])
        captions.append(caption)

In [21]:
# Gán eos_token làm pad_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize captions
inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=True)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [22]:
# Nếu thêm [PAD], hãy đảm bảo resize embedding
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
gpt2_model.resize_token_embeddings(len(tokenizer))


Embedding(50259, 768)

In [23]:
import torch.nn as nn

# Kích thước embedding
image_dim = 768  # Kích thước đầu ra của ViT
gpt2_dim = gpt2_model.config.hidden_size

# Mạng kết nối
class MappingNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MappingNetwork, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.activation = nn.Tanh()
    
    def forward(self, x):
        x = self.linear(x)
        return self.activation(x)

mapping_network = MappingNetwork(image_dim, gpt2_dim)

In [24]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

# Chuẩn bị DataLoader
image_features = torch.tensor(image_features)
captions_input_ids = inputs['input_ids']
dataset = TensorDataset(image_features, captions_input_ids)

# Tối ưu hóa
optimizer = AdamW(list(mapping_network.parameters()) + list(gpt2_model.parameters()), lr=5e-5)

  image_features = torch.tensor(image_features)


In [25]:
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [26]:
from torch.optim import AdamW
from torch.amp import GradScaler  # Sử dụng GradScaler từ torch.amp

# Optimizer và scaler
optimizer = AdamW(gpt2_model.parameters(), lr=1e-4)

scaler = GradScaler(device='cuda')


In [27]:
import torch

def save_checkpoint(epoch, model, optimizer, scaler, path="checkpoint.pth"):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scaler_state_dict": scaler.state_dict() if scaler else None,
    }
    torch.save(checkpoint, path)
    print(f"Checkpoint saved at epoch {epoch + 1} to {path}")

In [28]:
from torch.amp import autocast
for epoch in range(5):
    print(f"Starting Epoch {epoch + 1}...")
    dataloader_iter = tqdm(enumerate(dataloader), total=len(dataloader))
    for i, (img_feat, caption_ids) in dataloader_iter:
        try:
            with autocast(device_type='cuda'):
                seq_len = caption_ids.shape[1]
                img_embedding = mapping_network(img_feat)
                img_embedding = img_embedding[:, :seq_len, :]
                caption_embedding = gpt2_model.transformer.wte(caption_ids)
                inputs_embeds = torch.cat((img_embedding, caption_embedding), dim=1)
                labels = torch.cat((
                    torch.full((img_embedding.shape[0], img_embedding.shape[1]), -100, device=caption_ids.device),
                    caption_ids
                ), dim=1)

                outputs = gpt2_model(inputs_embeds=inputs_embeds, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            dataloader_iter.set_description(f"Epoch {epoch + 1}")
            dataloader_iter.set_postfix(loss=loss.item())

        except Exception as e:
            print(f"Error in Batch {i + 1}: {e}")
            continue
    
    # Lưu checkpoint sau mỗi epoch
    save_checkpoint(epoch, gpt2_model, optimizer, scaler, path="checkpoint.pth")
    print(f"Epoch {epoch + 1} completed.")
    print(f"Epoch {epoch + 1} - Loss: {loss.item()}")


Starting Epoch 1...


Epoch 1: 100%|██████████| 211/211 [1:21:17<00:00, 23.11s/it, loss=0.454]


Checkpoint saved at epoch 1 to checkpoint.pth
Epoch 1 completed.
Epoch 1 - Loss: 0.45360246300697327
Starting Epoch 2...


Epoch 2: 100%|██████████| 211/211 [1:21:32<00:00, 23.19s/it, loss=0.373]


Checkpoint saved at epoch 2 to checkpoint.pth
Epoch 2 completed.
Epoch 2 - Loss: 0.37283942103385925
Starting Epoch 3...


Epoch 3: 100%|██████████| 211/211 [1:22:08<00:00, 23.36s/it, loss=0.393]


Checkpoint saved at epoch 3 to checkpoint.pth
Epoch 3 completed.
Epoch 3 - Loss: 0.39291220903396606
Starting Epoch 4...


Epoch 4: 100%|██████████| 211/211 [1:20:50<00:00, 22.99s/it, loss=0.304]


Checkpoint saved at epoch 4 to checkpoint.pth
Epoch 4 completed.
Epoch 4 - Loss: 0.3038335144519806
Starting Epoch 5...


Epoch 5: 100%|██████████| 211/211 [1:21:44<00:00, 23.25s/it, loss=0.268]


Checkpoint saved at epoch 5 to checkpoint.pth
Epoch 5 completed.
Epoch 5 - Loss: 0.26789599657058716


In [None]:
model.save

In [None]:
df_test_preprocessed['caption'].iloc[7]

In [None]:
loaded_data = np.load("/kaggle/working/features.npz", allow_pickle=True)
features = dict(loaded_data)

In [None]:
df_test_preprocessed['caption'] = df_test_preprocessed['caption'].str.replace(r'^startseq ', '', regex=True)
df_test_preprocessed['caption'] = df_test_preprocessed['caption'].str.replace(r' endseq$', '', regex=True)

In [None]:
df_test_merge = df_test_preprocessed.groupby('image_path')['caption'].apply(list).reset_index()

In [None]:
df_test_merge

In [None]:
for index,record in df_test_merge.iterrows():
    img = load_img(record['image_path'],target_size=(224,224))
    img = img_to_array(img)
    img = img/255.

    caption = predict_caption(gpt2_model, record['image_path'], tokenizer, max_length, features)
    df_test_merge.loc[index,'predict'] = caption

In [None]:
df_test_merge['predict'] = df_test_merge['predict'].str.replace(r'^startseq ', '', regex=True)
df_test_merge['predict'] = df_test_merge['predict'].str.replace(r' endseq$', '', regex=True)

In [None]:
df_test_merge

In [None]:
def calculate_bleu_scores(row):
    reference = [caption.split() for caption in row['caption']]
    prediction = row['predict'].split()

    smoothie = SmoothingFunction().method4

    bleu_1 = sentence_bleu(reference, prediction, weights=(1.0, 0, 0, 0), smoothing_function=smoothie)
    bleu_2 = sentence_bleu(reference, prediction, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu_3 = sentence_bleu(reference, prediction, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
    bleu_4 = sentence_bleu(reference, prediction, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    return pd.Series([bleu_1, bleu_2, bleu_3, bleu_4])

In [None]:
df_test_merge[['bleu_1', 'bleu_2', 'bleu_3', 'bleu_4']] = df_test_merge.apply(calculate_bleu_scores, axis=1)