## Create PyTorch Dataset

In [None]:
!pip install -U bitsandbytes

# !!RESTART AND CLEAR CELL OUTPUTS AFTER UPDATING bitstandbytes

In [1]:
from skimage import io
import numpy as np
image = io.imread('/kaggle/input/rocov2/ROCOv2/train_images/train/ROCOv2_2023_train_000001.jpg')  # Replace with actual path
print(image.shape)  # Should show (height, width, channels)

(748, 682, 3)


In [2]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
from skimage import io

import torch
from torch.utils.data import DataLoader
from transformers import Blip2ForConditionalGeneration
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text.bleu import BLEUScore
# from torchmetrics.text.meteor import METEORScore
# from torchmetrics.text.cider import CIDERScore
import os


class ROCODataset(Dataset):
    def __init__(
        self,
        root_dir: str,
        train: bool = False,
        valid: bool = False,
        test: bool = False,
        transform=None
    ):
        self.root_dir = root_dir
        self.transform = transform
        self.train = train
        self.valid = valid
        self.test = test

        self.train_dir = os.path.join(root_dir, 'train_images/train')
        self.val_dir = os.path.join(root_dir, 'valid_images/valid')
        self.test_dir = os.path.join(root_dir, 'test_images/test')

        self.captions = {}
        caption_files = {
            'train': f'{root_dir}/train_captions.csv',
            'valid': f'{root_dir}/valid_captions.csv',
            'test': f'{root_dir}/test_captions.csv'
        }

        for caption in caption_files:
            df = pd.read_csv(caption_files[caption])
            self.captions[caption] = df.set_index('ID').to_dict()['Caption']

        df = pd.read_csv(f'{root_dir}/cui_mapping.csv')
        self.cui_mapping = df.set_index('CUI').to_dict()['Canonical name']

        self.train_len = len(self.captions['train'])
        self.valid_len = len(self.captions['valid'])
        self.test_len = len(self.captions['test'])

    def __len__(self):
        length = 0
        if self.train:
            length += self.train_len
        if self.valid:
            length += self.valid_len
        if self.test:
            length += self.test_len
        return length

    def __getitem__(self, idx) -> dict:
        file_name = ''
        image_loc = ''
        caption = ''

        if self.train:
            file_name = list(self.captions['train'].keys())[idx]
            caption = self.captions['train'][file_name]
            image_loc = os.path.join(self.root_dir, 'train_images/train', f'{file_name}.jpg')
        elif self.valid:
            file_name = list(self.captions['valid'].keys())[idx]
            caption = self.captions['valid'][file_name]
            image_loc = os.path.join(self.root_dir, 'valid_images/valid', f'{file_name}.jpg')
        elif self.test:
            file_name = list(self.captions['test'].keys())[idx]
            caption = self.captions['test'][file_name]
            image_loc = os.path.join(self.root_dir, 'test_images/test', f'{file_name}.jpg')

        img = io.imread(image_loc)

        if self.transform:
            img = self.transform(img)

        return {'image': img, 'text': caption}


# # Define a transform to resize images to 256x256 pixels
# data_transform = transforms.Compose([
#     transforms.ToPILImage(),  # Convert numpy array to PIL image
#     transforms.Resize((224, 224)),  # Resize the image to 256x256 pixels
#     transforms.ToTensor(),  # Convert PIL image to tensor
    # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

root_dir = "/kaggle/input/rocov2/ROCOv2"
train_data = ROCODataset(root_dir, train=True)
valid_data = ROCODataset(root_dir, valid=True)
test_data = ROCODataset(root_dir, test=True)

2025-07-05 11:44:21.081449: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751715861.305001      88 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751715861.367623      88 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from torch.utils.data import Dataset, DataLoader

"""class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding"""
from PIL import Image
import numpy as np

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]

        # NumPy array ise PIL image'e çevir
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        # Eğer grayscale ya da RGBA ise, RGB'ye çevir
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Process image (not text yet)
        encoding = self.processor(images=image, padding="max_length", return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding


def collate_fn(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch


## Load model and processor

In [4]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                         llm_int8_threshold=200.0)

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b")

model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b",
    #device_map="auto",
    device_map={"": 0},
    torch_dtype=torch.float32,
    quantization_config=quantization_config
)

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [5]:
train_dataset = ImageCaptioningDataset(train_data, processor)
test_dataset = ImageCaptioningDataset(test_data, processor)
valid_dataset = ImageCaptioningDataset(valid_data, processor)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)

In [13]:
import torch
import gc

def clear_gpu_memory():

    # Python tarafında çöp topla
    gc.collect()

    # PyTorch’un rezerve ettiği kullanılmayan belleği serbest bırak
    torch.cuda.empty_cache()

    # PyTorch’un GPU memory allocator’ını resetle
    torch.cuda.ipc_collect()

for i in range(50):    
    clear_gpu_memory()


In [7]:
from peft import LoraConfig, get_peft_model

# Let's define the LoraConfig
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 7,761,257,984 || trainable%: 0.1081


# SU ANLIK GEREKSIZ

In [8]:
# load weights from a epoch
fine_tuned_weights = torch.load("saved_models_BIIG2/model_epoch_1.pth")
model.load_state_dict(fine_tuned_weights)

FileNotFoundError: [Errno 2] No such file or directory: 'saved_models_BIIG2/model_epoch_1.pth'

## Train the model

model trained with 100 images

In [24]:
for i in range(50):    
    clear_gpu_memory()


In [16]:
import torch
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
import os
import gc

# GPU temizleme fonksiyonu
def clear_gpu_memory():
    torch.cuda.empty_cache()
    gc.collect()

# Cihaz ayarı
device = torch.device("cuda:0")
model = model.to(device)

# Dataset alt kümesi
train_subset = Subset(train_dataset, list(range(100)))
train_dataloader = DataLoader(train_subset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Modeli eğitim moduna al
model.train()

# Kayıt dizini
save_dir = "saved_models_BIIG"
os.makedirs(save_dir, exist_ok=True)
clear_gpu_memory()
# Eğitim döngüsü
for epoch in range(2):  
    print(f"Epoch: {epoch}")
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")

    for batch in epoch_iterator:
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        epoch_iterator.set_postfix(loss=loss.item())

        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), os.path.join(save_dir, f"model_epoch_{epoch}.pth"))


Epoch: 0


Iteration: 100%|██████████| 50/50 [00:54<00:00,  1.08s/it, loss=9.26]


Epoch: 1


Iteration: 100%|██████████| 50/50 [00:54<00:00,  1.10s/it, loss=10]  


### part 1

In [None]:
import torch
from tqdm import tqdm
import os

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set the model to training mode
model.train()

# Directory to save models
save_dir = "saved_models_BIIG"
os.makedirs(save_dir, exist_ok=True)

# Training loop with progress bar and model saving
for epoch in range(5):
    clear_gpu_memory()

    print(f"Epoch: {epoch}")
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")

    for idx, batch in enumerate(epoch_iterator):
        clear_gpu_memory()
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)  # Removed float16

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        epoch_iterator.set_postfix(loss=loss.item())

        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), os.path.join(save_dir, f"model_epoch_{epoch}.pth"))


### part       2

In [None]:
import torch
from tqdm import tqdm
import os

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set the model to training mode
model.train()

# Directory to save models
save_dir = "saved_models_BIIG2"
os.makedirs(save_dir, exist_ok=True)

# Training loop with progress bar and model saving
for epoch in range(2):
    print(f"Epoch: {epoch}")
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    
    for idx, batch in enumerate(epoch_iterator):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device, torch.float16)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        
        epoch_iterator.set_postfix(loss=loss.item())
        
        loss.backward()
        optimizer.step()
    
    # Save model after each epoch
    torch.save(model.state_dict(), os.path.join(save_dir, f"model_epoch_{epoch}.pth"))

# ME

In [18]:
model.eval()
def img_to_cap(img,model,processor,device):
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    pixel_values = inputs.pixel_values

    generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption

In [23]:
predictions = []
references = []

# Initialize counters and batch size
batch_size = 100
file_index = 0

# Process each item in the test data
for idx in tqdm(range(len(test_data))):
    # Get the image and caption
    image = valid_data[idx]['image']
    caption = valid_data[idx]['text']
    
    # Generate the prediction
    gen_ = img_to_cap(image, model, processor, 'cuda')
    
    # Append the prediction and reference to lists
    predictions.append(gen_)
    references.append([caption])
    
    # Every 100 examples, save the results to a text file
    if (idx + 1) % batch_size == 0 or (idx + 1) == len(test_data):
        # Define the file names for saving
        pred_filename = f'predictions_batch_{file_index}.txt'
        ref_filename = f'references_batch_{file_index}.txt'
        
        # Save predictions
        with open(pred_filename, 'w') as pred_file:
            for pred in predictions:
                pred_file.write(pred + '\n')
        
        # Save references
        with open(ref_filename, 'w') as ref_file:
            for ref_list in references:
                ref_file.write('\t'.join(ref_list) + '\n')
        
        
        # Increment file index
        file_index += 1

print("Processing and saving completed.")

# for idx in tqdm(range(len(test_data))):
#     image = valid_data[idx]['image'].to(device)
#     caption = valid_data[idx]['text'].to(device)
#     gen_ = img_to_cap(image,model, processor, 'cuda')

#     predictions.append(gen_)
#     references.append([caption])  

  1%|          | 99/9927 [02:36<4:18:34,  1.58s/it]


OSError: [Errno 28] No space left on device

In [None]:
import json

def process_predictions(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    predictions = [{"image_id": str(index), "caption": line.strip()} for index, line in enumerate(lines)]
    
    with open('captions_example.json', 'w') as json_file:
        json.dump(predictions, json_file, indent=4)

def process_references(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    images = [{"id": str(index)} for index in range(len(lines))]
    annotations = [{"image_id": str(index), "id": str(index), "caption": line.strip()} for index, line in enumerate(lines)]
    
    references = {
        "images": images,
        "annotations": annotations
    }
    
    with open('references_example.json', 'w') as json_file:
        json.dump(references, json_file, indent=4)

# Paths to the text files
predictions_file_path = '/home/CinCin/code/predictions_batch_65.txt'
references_file_path = '/home/CinCin/code/references_batch_65.txt'

# Process the files and create JSON files
process_predictions(predictions_file_path)
process_references(references_file_path)


zeynep codes

In [27]:
for i in range(50):    
    clear_gpu_memory()


In [22]:
from PIL import Image
import numpy as np

def img_to_cap(img, model, processor, device):
    model.eval()

    # Eğer image NumPy array ise ve 3 kanallı değilse, RGB'ye çevir
    if isinstance(img, np.ndarray):
        if img.ndim == 2:  # grayscale
            img = np.stack([img]*3, axis=-1)  # grayscale -> RGB
        img = Image.fromarray(img.astype('uint8')).convert("RGB")

    # Görüntüyü modele verilecek formata getir
    inputs = processor(images=img, return_tensors="pt").to(device)
    
    # Görüntüden caption üret
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
        caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return caption


In [28]:
# Yalnızca ilk 10 görüntüyü işle
num_examples_to_try = 10  # İstersen bunu 5 veya başka bir sayı yapabilirsin

predictions = []
references = []

batch_size = 5  # Küçük tut, her 5 örnekte bir dosya kaydedilecek
file_index = 0

# Sınırlı sayıda veri üzerinde çalış
for idx in tqdm(range(num_examples_to_try)):
    image = valid_data[idx]['image']
    caption = valid_data[idx]['text']
    
    gen_ = img_to_cap(image, model, processor, 'cuda')
    
    predictions.append(gen_)
    references.append([caption])
    
    if (idx + 1) % batch_size == 0 or (idx + 1) == num_examples_to_try:
        pred_filename = f'predictions_batch_{file_index}.txt'
        ref_filename = f'references_batch_{file_index}.txt'
        
        with open(pred_filename, 'w') as pred_file:
            for pred in predictions:
                pred_file.write(pred + '\n')
        
        with open(ref_filename, 'w') as ref_file:
            for ref_list in references:
                ref_file.write('\t'.join(ref_list) + '\n')
        
        file_index += 1

print("Processing and saving completed.")


 40%|████      | 4/10 [00:07<00:10,  1.80s/it]


OSError: [Errno 28] No space left on device

In [None]:
# import pymeteor.pymeteor as pymeteor
# # metoer
# sumup = 0
# for idx in range(len(references)):
#     reference = references[idx][0]

#     meteor_score = pymeteor.meteor(reference, predictions[ref])
#     sumup += meteor_score
# print(sumup/len(references))

In [None]:
for idx in tqdm(range(len(test_data))):
    # Get the image and caption
    image = valid_data[idx]['image']
    caption = valid_data[idx]['text']
    
    # Generate the prediction
    gen_ = img_to_cap(image, model, processor, 'cuda')

In [None]:
plt.imshow(valid_data[1]['image'])

image = valid_data[1]['image']
caption = valid_data[1]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 20
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 180
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 499
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
! python /home/CinCin/bleu-rouge-meteor-cider-spice-eval4imagecaption/example/main.py

## Inference

In [None]:
import torch
from matplotlib import pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

fig = plt.figure(figsize=(18, 14))

# prepare image for the model
for i, example in enumerate(test_data):
  image = example["image"]
  inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
  pixel_values = inputs.pixel_values

  generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
  generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  fig.add_subplot(2, 3, i+1)
  plt.imshow(image)
  plt.axis("off")
  plt.title(f"Generated caption: {generated_caption}")