In [1]:
#import libraries
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration, get_scheduler
from torch.optim import AdamW
from PIL import Image
import os

2025-12-22 18:52:21.786460: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766429541.969578      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766429542.022104      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766429542.460737      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766429542.460772      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766429542.460775      55 computation_placer.cc:177] computation placer alr

In [2]:
image_folder = "/kaggle/input/flickr8k/Images"
captions_file = "/kaggle/input/flickr8k/captions.txt"  # CSV: image,caption

batch_size = 8
epochs = 7
lr = 5e-5
max_length = 64

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [3]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")  #handles preprocessing
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [4]:
class Flickr8kDataset(Dataset):
    def __init__(self, captions_file, image_folder, processor):
        self.data = []
        self.image_folder = image_folder
        self.processor = processor

        with open(captions_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # skip header
        for line in lines[1:]:
            line = line.strip()
            if not line:
                continue

            # split commas one time because captions can has commas
            parts = line.split(",", 1)
            if len(parts) != 2:
                continue

            img_name, caption = parts
            img_path = os.path.join(self.image_folder, img_name)

            if not os.path.exists(img_path):
                continue

            self.data.append((img_name, caption))   # Stores valid samples in memory

        print("Total samples:", len(self.data))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name, caption = self.data[idx]   
        img_path = os.path.join(self.image_folder, img_name)

        image = Image.open(img_path).convert("RGB")

        encoding = self.processor(   # img encoder , caption decoder from blib
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"  #returns the processed image + text as PyTorch tensors instead of lists or NumPy arrays
        )
        input_ids = encoding["input_ids"].squeeze(0)   # remove batch dimension
        attention_mask = encoding["attention_mask"].squeeze(0)
        pixel_values = encoding["pixel_values"].squeeze(0)

        labels = input_ids.clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100 # Convert padding token IDs to -100 "pytourch crossentropy function ignore this value automatically" 

        return {
            "pixel_values": pixel_values,   # 224 224 3 
            "input_ids": input_ids,         # embedding
            "attention_mask": attention_mask, # give 1 if token zero if padding
            "labels": labels  # give -100 for padding 
        }



In [5]:
dataset = Flickr8kDataset(captions_file, image_folder, processor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("Number of batches:", len(dataloader))

Total samples: 40455
Number of batches: 5057


In [6]:
optimizer = AdamW(model.parameters(), lr=lr)

num_training_steps = epochs * len(dataloader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # start with initial value without increase
    num_training_steps=num_training_steps
)

print("Training steps:", num_training_steps)

Training steps: 35399


In [7]:
model.train()  # dropout and gradient active

for epoch in range(epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    total_loss = 0.0  # to accumulate batch losses

    for batch in loop:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()  # accumulate

        optimizer.zero_grad()  
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)  # average over batches
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")


Epoch 1/7: 100%|██████████| 5057/5057 [55:15<00:00,  1.53it/s]


Epoch 1/7 - Average Loss: 1.9890


Epoch 2/7: 100%|██████████| 5057/5057 [55:18<00:00,  1.52it/s]


Epoch 2/7 - Average Loss: 1.5720


Epoch 3/7: 100%|██████████| 5057/5057 [55:18<00:00,  1.52it/s]


Epoch 3/7 - Average Loss: 1.2803


Epoch 4/7: 100%|██████████| 5057/5057 [55:20<00:00,  1.52it/s]


Epoch 4/7 - Average Loss: 1.0073


Epoch 5/7: 100%|██████████| 5057/5057 [55:18<00:00,  1.52it/s]


Epoch 5/7 - Average Loss: 0.7538


Epoch 6/7: 100%|██████████| 5057/5057 [55:17<00:00,  1.52it/s]


Epoch 6/7 - Average Loss: 0.5420


Epoch 7/7: 100%|██████████| 5057/5057 [55:20<00:00,  1.52it/s] 

Epoch 7/7 - Average Loss: 0.3868





In [8]:
model.save_pretrained("./blip-finetuned-flickr8k")
processor.save_pretrained("./blip-finetuned-flickr8k")


[]

In [9]:
# load model
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model_path = "./blip-finetuned-flickr8k"

processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)

Using device: cuda


In [10]:
# load data
df = pd.read_csv("/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt", sep="\t", names=["en","ar"])
df = df.dropna()
print(df.shape)
df.head()

(24638, 2)


Unnamed: 0,en,ar
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!


In [11]:
# load pretrained model
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ar_AR")
model = MBartForConditionalGeneration.from_pretrained(model_name).cuda() # load translation weights


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [12]:
class TranslationDataset(Dataset):
    def __init__(self, df):
        self.en = df["en"].tolist()
        self.ar = df["ar"].tolist()
    def __len__(self):
        return len(self.en)

    def __getitem__(self, idx):
        src = tokenizer(
            self.en[idx],
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )
        tgt = tokenizer(
            self.ar[idx],
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )
        labels = tgt["input_ids"].squeeze()   # Removes extra dimension [64] only 
        labels[labels == tokenizer.pad_token_id] = -100
        return {
            "input_ids": src["input_ids"].squeeze(),
            "attention_mask": src["attention_mask"].squeeze(),
            "labels": labels
        }

In [13]:
dataset = TranslationDataset(df)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [15]:
# start training 
epochs = 5
model.train()

for epoch in range(epochs):
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.cuda() for k, v in batch.items()}   # Moves all tensors in the batch to GPU
        
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss = {total_loss/len(train_loader)}")

Epoch 1: 100%|██████████| 6160/6160 [47:18<00:00,  2.17it/s]


Epoch 1 Loss = 2.20207907373642


Epoch 2: 100%|██████████| 6160/6160 [47:20<00:00,  2.17it/s]


Epoch 2 Loss = 1.5835523873180537


Epoch 3: 100%|██████████| 6160/6160 [47:18<00:00,  2.17it/s]


Epoch 3 Loss = 1.2050856505315024


Epoch 4: 100%|██████████| 6160/6160 [47:19<00:00,  2.17it/s]


Epoch 4 Loss = 0.8990478008163053


Epoch 5: 100%|██████████| 6160/6160 [47:20<00:00,  2.17it/s]

Epoch 5 Loss = 0.6618388353314782





In [16]:
# save model
model.save_pretrained("mbart_en_ar_model")
tokenizer.save_pretrained("mbart_en_ar_model")




('mbart_en_ar_model/tokenizer_config.json',
 'mbart_en_ar_model/special_tokens_map.json',
 'mbart_en_ar_model/sentencepiece.bpe.model',
 'mbart_en_ar_model/added_tokens.json',
 'mbart_en_ar_model/tokenizer.json')

In [17]:
# load model 
model_dir = "/kaggle/working/mbart_en_ar_model"
tokenizer = MBart50TokenizerFast.from_pretrained(model_dir)
model = MBartForConditionalGeneration.from_pretrained(model_dir, device_map="auto")


In [18]:
import gradio as gr
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load models
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# BLIP (English caption generator)
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# mBART (English -> Arabic translator)
mbart_model_path = "mbart_en_ar_model"  # path to your fine-tuned model
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_path, src_lang="en_XX", tgt_lang="ar_AR")
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_path).to(device)

# Caption + Translation function
def generate_caption(image, language="en"):
    # Step 1: English caption
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs, max_length=64)
    english_caption = blip_processor.decode(out[0], skip_special_tokens=True)

    if language == "en":
        return english_caption
    else:
        # Step 2: Translate to Arabic
        inputs = mbart_tokenizer(english_caption, return_tensors="pt").to(device)
        translated_ids = mbart_model.generate(**inputs, max_length=64)
        arabic_caption = mbart_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        return arabic_caption

# Gradio interface
iface = gr.Interface(
    fn=generate_caption,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Dropdown(["en", "ar"], label="Language")
    ],
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning + Translation",
    description="Upload an image and get a caption in English or Arabic using BLIP and mBART."
)

iface.launch()


Using device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 25.12 MiB is free. Process 3897 has 15.86 GiB memory in use. Of the allocated memory 15.24 GiB is allocated by PyTorch, and 331.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [29]:
image_path = "/kaggle/input/cat-mn-google/Screenshot 2025-12-16 180019.png"  # replace with your image path
image = Image.open(image_path).convert("RGB")

# Generate caption
inputs = processor(images=image, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=64)

caption = processor.decode(outputs[0], skip_special_tokens=True)
print("Generated Caption:", caption)

Generated Caption: orange cat playing with soccer ball


In [7]:
def translate(text):
    model.eval()

    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    generated = model.generate(
        **inputs,
        max_length=64,
        num_beams=4
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(translate("Visiting relatives can be annoying, especially when they overstay their welcome"))


زيارة الاقرباء قد تكون مزعجة، خصوصا عندما يتأخرون عن مرحب بهم
