In [1]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.amp import GradScaler, autocast

In [2]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Using device:", device)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

Using device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
caption_file = "C:/Users/ADARSH S/OneDrive/Desktop/GenAI Project/dataset/captions.txt"
image_folder = "C:/Users/ADARSH S/OneDrive/Desktop/GenAI Project/dataset/Images"

df = pd.read_csv(caption_file, sep=',', names=['image', 'caption'])
df['image'] = df['image'].apply(lambda x: x.split('#')[0])
df = df.groupby('image').first().reset_index()
df = df[df['image'].apply(lambda x: os.path.exists(os.path.join(image_folder, x)))]
print("Total usable images:", len(df))


Total usable images: 8091


In [None]:
class FlickrDataset(Dataset):
    def __init__(self, dataframe, image_dir, feature_extractor, tokenizer, max_length=16):
        self.df = dataframe
        self.image_dir = image_dir
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row['image'])
        caption = row['caption']

        image = Image.open(image_path).convert("RGB")
        pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values[0]

        tokens = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "pixel_values": pixel_values,
            "input_ids": tokens["input_ids"][0],
            "attention_mask": tokens["attention_mask"][0]
        }


In [5]:
batch_size = 16 
dataset = FlickrDataset(df, image_folder, feature_extractor, tokenizer)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
scaler = GradScaler(device='cuda')

epochs = 10

for epoch in range(1,epochs):
    model.train()
    print(f"\nEpoch {epoch + 1}")
    total_loss = 0

    for batch in tqdm(train_loader):
        pixel_values = batch["pixel_values"].to(device, non_blocking=True)
        input_ids = batch["input_ids"].to(device, non_blocking=True)

        with autocast(device_type='cuda'):
            outputs = model(pixel_values=pixel_values, labels=input_ids)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")

    # Save after every epoch
    output_dir = f"fine-tuned-captioning-epoch{epoch+1:02d}"
    model.save_pretrained(output_dir)
    feature_extractor.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to: {output_dir}")



Epoch 2


100%|██████████| 506/506 [50:20<00:00,  5.97s/it] 


Average loss: 2.6464
Model saved to: fine-tuned-captioning-epoch02

Epoch 3


100%|██████████| 506/506 [48:34<00:00,  5.76s/it]


Average loss: 2.3070
Model saved to: fine-tuned-captioning-epoch03


In [None]:
model.save_pretrained("vit-gpt2-captioning-model")
feature_extractor.save_pretrained("vit-gpt2-captioning-model")
tokenizer.save_pretrained("vit-gpt2-captioning-model")


('vit-gpt2-captioning-model\\tokenizer_config.json',
 'vit-gpt2-captioning-model\\special_tokens_map.json',
 'vit-gpt2-captioning-model\\vocab.json',
 'vit-gpt2-captioning-model\\merges.txt',
 'vit-gpt2-captioning-model\\added_tokens.json',
 'vit-gpt2-captioning-model\\tokenizer.json')