## Create PyTorch Dataset

In [None]:
!pip install -U bitsandbytes

# !!RESTART AND CLEAR CELL OUTPUTS AFTER UPDATING bitstandbytes

In [9]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
from skimage import io
from transformers import Blip2ForConditionalGeneration
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text.bleu import BLEUScore

class ROCODataset(Dataset):
    def __init__(
        self,
        root_dir: str,
        split: str = "train",  # Changed to single split parameter
        transform=None
    ):
        self.root_dir = root_dir
        self.transform = transform
        self.split = split
        
        # Set directories based on split
        if split == "train":
            self.image_dir = os.path.join(root_dir, 'train_images/train')
            caption_file = f'{root_dir}/train_captions.csv'
        elif split == "valid":
            self.image_dir = os.path.join(root_dir, 'valid_images/valid')
            caption_file = f'{root_dir}/valid_captions.csv'
        elif split == "test":
            self.image_dir = os.path.join(root_dir, 'test_images/test')
            caption_file = f'{root_dir}/test_captions.csv'
        else:
            raise ValueError(f"Invalid split: {split}. Must be 'train', 'valid', or 'test'")
        
        # Load captions for this split only
        self.captions_df = pd.read_csv(caption_file)
        self.image_ids = self.captions_df['ID'].tolist()
        self.captions = self.captions_df['Caption'].tolist()
        
        # Load CUI mapping (if needed)
        cui_file = f'{root_dir}/cui_mapping.csv'
        if os.path.exists(cui_file):
            df = pd.read_csv(cui_file)
            self.cui_mapping = df.set_index('CUI').to_dict()['Canonical name']
        else:
            self.cui_mapping = {}
    
    def __len__(self):
        return len(self.image_ids)
    
    def __getitem__(self, idx) -> dict:
        image_id = self.image_ids[idx]
        caption = self.captions[idx]
        
        # Construct image path
        image_path = os.path.join(self.image_dir, f'{image_id}.jpg')
        
        # Load image
        try:
            img = io.imread(image_path)
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            # Return a dummy image in case of error
            img = np.zeros((224, 224, 3), dtype=np.uint8)
        
        if self.transform:
            img = self.transform(img)
        
        return {'image': img, 'text': caption}

# Initialize datasets
root_dir = "/kaggle/input/rocov2/ROCOv2"
train_data = ROCODataset(root_dir, split="train")
valid_data = ROCODataset(root_dir, split="valid")
test_data = ROCODataset(root_dir, split="test")

Loaded 59958 train captions from /kaggle/input/ROCOv2/train_captions.csv
Loaded 9904 valid captions from /kaggle/input/ROCOv2/valid_captions.csv
Loaded 9927 test captions from /kaggle/input/ROCOv2/test_captions.csv
Dataset lengths - Train: 59958, Valid: 0, Test: 0
Dataset lengths - Train: 0, Valid: 9904, Test: 0
Dataset lengths - Train: 0, Valid: 0, Test: 9927


7

In [10]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import gc

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        
        # NumPy array ise PIL image'e çevir
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        
        # Eğer grayscale ya da RGBA ise, RGB'ye çevir
        if image.mode != "RGB":
            image = image.convert("RGB")
        
        # Resize image to reduce memory usage
        max_size = 384  # Reduce from default 224 if needed
        if max(image.size) > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        
        # Process image and text
        encoding = self.processor(images=image, text=item["text"], padding="max_length", return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]  # Keep original text for labels
        
        # Clean up
        del image
        return encoding

def collate_fn(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key == "text":
            # Tokenize all texts and keep as labels
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], 
                padding=True, 
                truncation=True,
                max_length=256,
                return_tensors="pt"
            )
            processed_batch["labels"] = text_inputs["input_ids"]  # Unshifted labels
            processed_batch["label_attention_mask"] = text_inputs["attention_mask"]
        elif key != "input_ids" and key != "attention_mask":  # Skip processed keys
            processed_batch[key] = torch.stack([example[key] for example in batch])
    
    # Process image encodings
    if "pixel_values" in batch[0]:
        processed_batch["pixel_values"] = torch.stack([example["pixel_values"] for example in batch])
    
    # Clean up batch
    del batch
    gc.collect()
    
    return processed_batch

## Load model and processor

In [11]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig
import torch
import gc

# Clear memory before loading model
gc.collect()
torch.cuda.empty_cache()

# More aggressive 4-bit quantization for better memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b", use_fast=True)

# Load model with low CPU memory usage - FORCE SINGLE GPU
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,  # Added for memory efficiency
    device_map={"": "cuda:0"}  # Force everything to cuda:0
)

# Enable gradient checkpointing for memory efficiency
if hasattr(model, 'gradient_checkpointing_enable'):
    model.gradient_checkpointing_enable()

print("Model loaded successfully!")
print(f"Model is on device: {next(model.parameters()).device}")

# Clear memory after loading
gc.collect()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
import torch
import gc

# Enhanced memory clearing function
def clear_gpu_memory():
    """Aggressively clear GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        # Force garbage collection multiple times
        for _ in range(3):
            gc.collect()

train_dataset = ImageCaptioningDataset(train_data, processor)
test_dataset = ImageCaptioningDataset(test_data, processor)
valid_dataset = ImageCaptioningDataset(valid_data, processor)

# REDUCED batch size for memory efficiency
train_dataloader = DataLoader(
    train_dataset, 
    shuffle=True, 
    batch_size=2,  # Reduced from 8 to 2
    collate_fn=collate_fn, 
    num_workers=2,  # Reduced from 4 to 2
    pin_memory=False,  # Disabled to save memory
    persistent_workers=False  # Don't keep workers alive
)

test_dataloader = DataLoader(
    test_dataset, 
    shuffle=False, 
    batch_size=2,  # Reduced from 8 to 2
    collate_fn=collate_fn, 
    num_workers=2,  # Reduced from 4 to 2
    pin_memory=False,  # Disabled to save memory
    persistent_workers=False
)

valid_dataloader = DataLoader(
    valid_dataset, 
    shuffle=False, 
    batch_size=2,  # Reduced from 8 to 2
    collate_fn=collate_fn, 
    num_workers=2,  # Reduced from 4 to 2
    pin_memory=False,  # Disabled to save memory
    persistent_workers=False
)

# Clear memory once
clear_gpu_memory()

from peft import LoraConfig, get_peft_model

# Reduced LoRA config for memory efficiency
config = LoraConfig(
    r=16,  # Reduced from 64 to 16
    lora_alpha=32,  # Reduced from 128 to 32
    lora_dropout=0.1,  # Slightly increased dropout
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"]  # Reduced target modules
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Clear memory after LoRA setup
clear_gpu_memory()

trainable params: 10,485,760 || all params: 3,755,247,616 || trainable%: 0.2792


## Train the model

### TEST TRAIN WITH ONLY 500 IMAGES

In [None]:
import torch
from torch.utils.data import Subset
from tqdm import tqdm
import os

# Create subset with only 500 images for testing
train_subset = Subset(train_dataset, range(500))
train_test_dataloader = DataLoader(train_subset, shuffle=True, batch_size=4, collate_fn=collate_fn)

# Better optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Determine device - use single GPU
device = "cuda:0"
print(f"Using device: {device}")

# Move model to single GPU to avoid device mismatch
model = model.to(device)

# Set the model to training mode
model.train()

# Directory to save test models
save_dir = "test_models_500"
os.makedirs(save_dir, exist_ok=True)

print(f"Training on {len(train_subset)} images for testing...")

# Test training loop with only 1 epoch
for epoch in range(1):
    print(f"Test Epoch: {epoch}")
    epoch_iterator = tqdm(train_test_dataloader, desc="Test Training")
    
    total_loss = 0
    num_batches = 0
    
    for idx, batch in enumerate(epoch_iterator):
        # Ensure all tensors are on the same device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        pixel_values = batch["pixel_values"].to(device)
        
        optimizer.zero_grad()
        
        # Use mixed precision for better performance
        with torch.amp.autocast('cuda'):
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss
        
        epoch_iterator.set_postfix(loss=loss.item())
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
        
        # Clear memory every 5 steps
        if idx % 5 == 0:
            clear_gpu_memory()
    
    # Save test model
    torch.save(model.state_dict(), os.path.join(save_dir, f"test_model_epoch_{epoch}.pth"))
    print(f"Test model saved for epoch {epoch}")
    print(f"Average loss: {total_loss / num_batches:.4f}")

print("Test training completed successfully!")

### FULL TRAIN

In [13]:
import torch
from tqdm import tqdm
import os

# Fix the deprecated warnings by using the new API
scaler = torch.amp.GradScaler('cuda')

# Better optimizer with lower learning rate for stability
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01, eps=1e-8)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3, eta_min=1e-6)

# Determine device
device = next(model.parameters()).device
print(f"Using device: {device}")

# Set the model to training mode
model.train()

# Directory to save models
save_dir = "saved_models_OPTIMIZED"
os.makedirs(save_dir, exist_ok=True)

# MEMORY OPTIMIZATION SETTINGS
gradient_accumulation_steps = 16
epochs = 1
max_steps_per_clear = 2

# Enable memory efficient settings
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("Starting training with aggressive memory optimizations...")
clear_gpu_memory()

for epoch in range(epochs):
    print(f"Epoch: {epoch+1}/{epochs}")
    epoch_iterator = tqdm(train_dataloader, desc="Training")
    
    total_loss = 0
    optimizer.zero_grad()
    
    for idx, batch in enumerate(epoch_iterator):
        try:
            # Ensure all tensors are on the same device with non_blocking transfer
            # In the training loop, replace the label setup with:
            input_ids = batch["input_ids"][:, :-1].contiguous()  # Shift left for input
            attention_mask = batch["attention_mask"][:, :-1].contiguous()
            labels = batch["labels"][:, 1:].contiguous()  # Shift right for labels
            labels = torch.where(batch["labels"][:, :-1].contiguous() == processor.tokenizer.pad_token_id, -100, labels)
            pixel_values = batch["pixel_values"]
            # Mixed precision training with memory optimization
            with torch.amp.autocast('cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=input_ids[:, :-1].contiguous(),  # Shift left for input
                    pixel_values=pixel_values,
                    attention_mask=attention_mask[:, :-1].contiguous(),  # Adjust mask
                    labels=labels
                )
                loss = outputs.loss / gradient_accumulation_steps
            
            # Debug: Check if loss has gradient
            if loss.grad_fn is None:
                print(f"Warning: Loss at step {idx} has no grad_fn. Skipping batch.")
                clear_gpu_memory()
                optimizer.zero_grad()
                continue
            
            # Backward pass with gradient scaling
            scaler.scale(loss).backward()
            
            total_loss += loss.item()
            epoch_iterator.set_postfix(loss=loss.item())
            
            # CRITICAL: Delete intermediate tensors immediately
            del input_ids, attention_mask, pixel_values, outputs, loss
            
            # Gradient accumulation with frequent memory clearing
            if (idx + 1) % gradient_accumulation_steps == 0:
                # Clip gradients to prevent explosion
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                
                # Clear memory after optimizer step
                clear_gpu_memory()
            
            # Very frequent memory clearing
            if idx % max_steps_per_clear == 0:
                clear_gpu_memory()
                
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"OOM at step {idx}. Clearing memory and skipping batch...")
                clear_gpu_memory()
                optimizer.zero_grad()
                continue
            else:
                raise e
    
    # Step scheduler
    scheduler.step()
    
    # Clear memory before saving
    clear_gpu_memory()
    
    # Save model after each epoch
    torch.save(model.state_dict(), os.path.join(save_dir, f"model_epoch_{epoch}.pth"))
    print(f"Model saved for epoch {epoch+1}")
    print(f"Average loss: {total_loss / len(train_dataloader):.4f}")
    print(f"Learning rate: {scheduler.get_last_lr()[0]:.6f}")
    
    # Final memory clear
    clear_gpu_memory()

print("Training completed!")

Using device: cuda:0
Starting training with aggressive memory optimizations...
Epoch: 1/1



Training:   0%|          | 0/29979 [00:00<?, ?it/s][AAsking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Training:   0%|          | 0/29979 [00:01<?, ?it/s]


KeyError: 'input_ids'

# Evaluation

In [None]:
import torch
import random
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from torch.utils.data import DataLoader
import os

def load_model_smart():
    """Load model from VRAM if available, otherwise from saved file"""
    global model
    try:
        # Check if model is already in VRAM and accessible
        if 'model' in globals() and model is not None:
            print("Model found in VRAM, using existing model...")
            return model
        else:
            raise NameError("Model not in VRAM")
    except:
        print("Model not found in VRAM, loading from saved file...")
        # Load from saved file
        from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig
        from peft import LoraConfig, get_peft_model
        
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        
        processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b", use_fast=True)
        
        # Comprehensive device map for multi-GPU
        device_map = {
            "vision_model": 0,
            "qformer": 0,
            "language_model": 0,
            "language_projection": 0,
            "qformer.bert": 0,
            "qformer.bert.embeddings": 0,
            "qformer.bert.encoder": 0,
            "qformer.bert.encoder.layer": 0,
            "qformer.layernorm": 0,
            "qformer.dropout": 0,
        }
        
        base_model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            device_map=device_map,
            torch_dtype=torch.float16,
            quantization_config=quantization_config,
            trust_remote_code=True
        )
        
        config = LoraConfig(
            r=64,
            lora_alpha=128,
            lora_dropout=0.05,
            bias="none",
            target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2", "gate_proj", "up_proj", "down_proj"]
        )
        
        model = get_peft_model(base_model, config)
        
        # Load the saved weights
        model_path = "saved_models_OPTIMIZED/model_epoch_4.pth"  # Updated path
        if os.path.exists(model_path):
            model.load_state_dict(torch.load(model_path, map_location='cuda'))
            print(f"Model loaded from {model_path}")
        else:
            print("Saved model not found, using base model")
        
        return model

def generate_caption_with_config(model, processor, pixel_values, config):
    """Generate caption with specific generation config"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    with torch.no_grad():
        try:
            with torch.cuda.amp.autocast():  # Added mixed precision
                generated_ids = model.generate(
                    pixel_values=pixel_values.unsqueeze(0).to(device),
                    **config,
                    early_stopping=False
                )
            caption = processor.decode(generated_ids[0], skip_special_tokens=True)
            # Enhanced filtering
            words = caption.split()
            unique_words = []
            for word in words:
                if word not in unique_words[-3:]:  # Avoid repeating last 3 words
                    if not (word.startswith('[') and word.endswith(']')) and not word.isdigit():
                        unique_words.append(word)
            caption = " ".join(unique_words)
            return caption
        except Exception as e:
            return f"Error - {str(e)}"

# Enhanced generation configs
configs = [
    {
        "max_length": 250,
        "min_length": 60,
        "num_beams": 8,
        "temperature": 0.7,
        "do_sample": True,
        "top_p": 0.9,
        "repetition_penalty": 2.0,
        "length_penalty": 2.0,
        "no_repeat_ngram_size": 3,
        "pad_token_id": processor.tokenizer.pad_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id
    },
    {
        "max_length": 300,
        "min_length": 80,
        "num_beams": 10,
        "temperature": 1.0,
        "do_sample": True,
        "top_k": 50,
        "repetition_penalty": 2.2,
        "length_penalty": 2.2,
        "no_repeat_ngram_size": 3,
        "pad_token_id": processor.tokenizer.pad_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id
    },
    {
        "max_length": 200,
        "min_length": 50,
        "num_beams": 6,
        "temperature": 0.6,
        "do_sample": True,
        "top_p": 0.8,
        "repetition_penalty": 2.0,
        "length_penalty": 1.8,
        "no_repeat_ngram_size": 3,
        "pad_token_id": processor.tokenizer.pad_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id
    }
]

# Get 10 random images from the raw test data
random_indices = random.sample(range(len(test_data)), 10)

print("Evaluating optimized model with 10 random images...")
print("=" * 60)

for i, idx in enumerate(random_indices):
    clear_gpu_memory()
    
    # Get image and original caption from raw dataset
    raw_sample = test_data[idx]
    image = raw_sample['image']
    original_caption = raw_sample['text']
    
    # Convert numpy array to PIL image for display
    if isinstance(image, np.ndarray):
        image_pil = Image.fromarray(image)
    else:
        image_pil = image
    
    # Ensure RGB format
    if image_pil.mode != "RGB":
        image_pil = image_pil.convert("RGB")
    
    # Process image for model input
    inputs = processor(images=image_pil, return_tensors="pt")
    pixel_values = inputs['pixel_values'].squeeze(0)
    
    # Display image
    plt.figure(figsize=(8, 6))
    plt.imshow(image_pil)
    plt.axis('off')
    plt.title(f"Test Image {i+1}")
    plt.show()
    
    # Generate captions with different configs
    print(f"\nImage {i+1} Results:")
    print("-" * 40)
    
    for j, config in enumerate(configs):
        try:
            caption = generate_caption_with_config(model, processor, pixel_values, config)
            print(f"Caption {j+1}: {caption}")
        except Exception as e:
            print(f"Caption {j+1}: Error - {str(e)}")
    
    print(f"Original: {original_caption}")
    print("=" * 60)

print("Enhanced visual evaluation completed!")

# BLEU Evaluation

In [None]:
import torch
from torchmetrics.text.bleu import BLEUScore
from tqdm import tqdm
import random

def evaluate_model_bleu(model, processor, test_dataloader, num_samples=100):
    """Evaluate model using BLEU score"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()
    
    # Initialize BLEU metric
    bleu = BLEUScore()
    
    generated_captions = []
    reference_captions = []
    
    print(f"Evaluating BLEU score on {num_samples} samples...")
    
    # Get random samples for evaluation
    all_samples = []
    for batch in test_dataloader:
        for i in range(len(batch['pixel_values'])):
            sample = {
                'pixel_values': batch['pixel_values'][i].unsqueeze(0),
                'text': batch['input_ids'][i]
            }
            all_samples.append(sample)
            if len(all_samples) >= num_samples:
                break
        if len(all_samples) >= num_samples:
            break
    
    # Evaluate samples
    for i, sample in enumerate(tqdm(all_samples[:num_samples], desc="Evaluating BLEU")):
        try:
            # Generate caption
            pixel_values = sample['pixel_values'].to(device)
            
            with torch.no_grad():
                generated_ids = model.generate(
                    pixel_values=pixel_values,
                    max_length=50,
                    num_beams=3,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=processor.tokenizer.pad_token_id
                )
            
            # Decode generated caption
            generated_caption = processor.decode(generated_ids[0], skip_special_tokens=True)
            
            # Decode reference caption
            reference_caption = processor.decode(sample['text'], skip_special_tokens=True)
            
            generated_captions.append(generated_caption)
            reference_captions.append([reference_caption])  # BLEU expects list of references
            
        except Exception as e:
            print(f"Error processing sample {i}: {str(e)}")
            continue
        
        # Clear memory every 10 samples
        if i % 10 == 0:
            clear_gpu_memory()
    
    # Calculate BLEU score
    if generated_captions and reference_captions:
        bleu_score = bleu(generated_captions, reference_captions)
        print(f"\nBLEU Score: {bleu_score.item():.4f}")
        
        # Show some examples
        print("\nExample predictions:")
        print("-" * 50)
        for i in range(min(5, len(generated_captions))):
            print(f"Generated: {generated_captions[i]}")
            print(f"Reference: {reference_captions[i][0]}")
            print("-" * 50)
    else:
        print("No valid samples for BLEU evaluation")
    
    return bleu_score.item() if generated_captions else 0.0

# Run BLEU evaluation
print("Starting BLEU evaluation...")
bleu_score = evaluate_model_bleu(model, processor, test_dataloader, num_samples=100)
print(f"Final BLEU Score: {bleu_score:.4f}")

zeynep codes

In [None]:
for i in range(50):    
    clear_gpu_memory()


In [None]:
from PIL import Image
import numpy as np

def img_to_cap(img, model, processor, device):
    model.eval()

    # Eğer image NumPy array ise ve 3 kanallı değilse, RGB'ye çevir
    if isinstance(img, np.ndarray):
        if img.ndim == 2:  # grayscale
            img = np.stack([img]*3, axis=-1)  # grayscale -> RGB
        img = Image.fromarray(img.astype('uint8')).convert("RGB")

    # Görüntüyü modele verilecek formata getir
    inputs = processor(images=img, return_tensors="pt").to(device)
    
    # Görüntüden caption üret
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
        caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return caption


In [None]:
# Yalnızca ilk 10 görüntüyü işle
num_examples_to_try = 10  # İstersen bunu 5 veya başka bir sayı yapabilirsin

predictions = []
references = []

batch_size = 5  # Küçük tut, her 5 örnekte bir dosya kaydedilecek
file_index = 0

# Sınırlı sayıda veri üzerinde çalış
for idx in tqdm(range(num_examples_to_try)):
    image = valid_data[idx]['image']
    caption = valid_data[idx]['text']
    
    gen_ = img_to_cap(image, model, processor, 'cuda')
    
    predictions.append(gen_)
    references.append([caption])
    
    if (idx + 1) % batch_size == 0 or (idx + 1) == num_examples_to_try:
        pred_filename = f'predictions_batch_{file_index}.txt'
        ref_filename = f'references_batch_{file_index}.txt'
        
        with open(pred_filename, 'w') as pred_file:
            for pred in predictions:
                pred_file.write(pred + '\n')
        
        with open(ref_filename, 'w') as ref_file:
            for ref_list in references:
                ref_file.write('\t'.join(ref_list) + '\n')
        
        file_index += 1

print("Processing and saving completed.")


In [None]:
# import pymeteor.pymeteor as pymeteor
# # metoer
# sumup = 0
# for idx in range(len(references)):
#     reference = references[idx][0]

#     meteor_score = pymeteor.meteor(reference, predictions[ref])
#     sumup += meteor_score
# print(sumup/len(references))

In [None]:
for idx in tqdm(range(len(test_data))):
    # Get the image and caption
    image = valid_data[idx]['image']
    caption = valid_data[idx]['text']
    
    # Generate the prediction
    gen_ = img_to_cap(image, model, processor, 'cuda')

In [None]:
plt.imshow(valid_data[1]['image'])

image = valid_data[1]['image']
caption = valid_data[1]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 20
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 180
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
idx_ = 499
plt.imshow(valid_data[idx_]['image'])

image = valid_data[idx_]['image']
caption = valid_data[idx_]['text']

# Generate the prediction
gen_ = img_to_cap(image, model, processor, 'cuda')

print('source:\n',caption)
print('gen:\n',gen_)

In [None]:
! python /home/CinCin/bleu-rouge-meteor-cider-spice-eval4imagecaption/example/main.py

## Inference

In [None]:
import torch
from matplotlib import pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

fig = plt.figure(figsize=(18, 14))

# prepare image for the model
for i, example in enumerate(test_data):
  image = example["image"]
  inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
  pixel_values = inputs.pixel_values

  generated_ids = model.generate(pixel_values=pixel_values, max_length=25)
  generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  fig.add_subplot(2, 3, i+1)
  plt.imshow(image)
  plt.axis("off")
  plt.title(f"Generated caption: {generated_caption}")