In [3]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Loaded (processor, model) pairs keyed by (model_name, device), so repeated
# captioning calls reuse the weights instead of reloading the checkpoint.
_BLIP2_CACHE = {}


def _load_blip2(model_name: str, device: str):
    """Return the (processor, model) pair for `model_name` on `device`, loading it on first use only."""
    cache_key = (model_name, device)
    if cache_key not in _BLIP2_CACHE:
        processor = Blip2Processor.from_pretrained(model_name)
        model = Blip2ForConditionalGeneration.from_pretrained(model_name).to(device)
        _BLIP2_CACHE[cache_key] = (processor, model)
    return _BLIP2_CACHE[cache_key]


def get_blip2_caption(image: Image.Image, model_name: str = "Salesforce/blip2-opt-2.7b-coco", max_length: int = 75) -> str:
    """
    Generate a caption for an image using BLIP-2.

    The processor and model are loaded once per (model_name, device) and then
    reused, so only the first call for a given model pays the loading cost.

    Args:
        image: PIL Image in RGB format
        model_name: BLIP-2 model to use
        max_length: Value forwarded to ``model.generate`` as ``max_length``

    Returns:
        str: Generated caption for the image, with special tokens removed and
        surrounding whitespace stripped
    """
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fetch the cached processor/model pair (loaded on first use).
    processor, model = _load_blip2(model_name, device)

    # Prepare image and generate caption
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(**inputs, max_length=max_length)

    # A single image was passed in, so the batch holds exactly one caption.
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    return caption

In [4]:
# Caption 06.png from the kodak-dataset directory; the returned string is the cell's output.
get_blip2_caption(
    image=Image.open("/scratch/vonder2/kodak-dataset/06.png"),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


'a boat is floating in the ocean near a shoreline'