## Image Caption Generation

In [1]:
import os
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import pandas as pd


# Model setup: BLIP-2 with the Flan-T5-XXL language model.
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor is used later to turn an (image, prompt) pair into model inputs
# and to decode generated token ids back into text.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xxl")

# Weights are requested in full precision (float32) and then moved to `device`.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xxl",
    torch_dtype=torch.float32,
)
# The trailing ';' keeps the notebook from echoing the model repr.
model.to(device);

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [49]:
def create_caption(prompt, image, min_new_tokens=12, max_new_tokens=20):
    """Create a caption for a single image.

    Relies on the notebook-level `processor`, `model` and `device` objects.

    Args:
        prompt: Text prompt handed to the processor together with the image.
        image: Image to caption (callers in this notebook pass a PIL image).
        min_new_tokens: Lower bound on the number of generated tokens.
            Defaults to 12, the value that was previously hard-coded.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        str: The first decoded sequence with special tokens removed and
        leading/trailing whitespace stripped.
    """
    # Cast floating-point inputs to the dtype the model was loaded with so the
    # function keeps working if the model is loaded in a different precision.
    inputs = processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)

    # Request deterministic (greedy) decoding explicitly. `temperature=0` is
    # not a valid sampling temperature and is unused when sampling is off, so
    # passing it only produced a warning; `do_sample=False` states the intent.
    generated_ids = model.generate(
        **inputs,
        do_sample=False,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
    )

    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0].strip()

def caption_images(prompt, image_dir='./images/'):
    """Create a caption for every image file in a directory.

    Entries are processed in sorted filename order, and each caption is
    printed next to its filename as it is produced.

    Args:
        prompt: Text prompt forwarded to `create_caption` for every image.
        image_dir: Directory containing the images. A trailing path
            separator is optional.

    Returns:
        list[str]: One caption per processed image, in sorted filename order.
    """
    captions = []
    for image_name in sorted(os.listdir(image_dir)):
        # os.path.join works whether or not image_dir ends with a separator.
        image_path = os.path.join(image_dir, image_name)

        # Skip hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints') and
        # anything that is not a regular file; neither can be opened as an image.
        if image_name.startswith('.') or not os.path.isfile(image_path):
            continue

        # The context manager closes the underlying file once captioning is done.
        with Image.open(image_path) as image:
            caption = create_caption(prompt, image)

        captions.append(caption)
        print(f'{image_name}: {caption}')
    return captions

In [51]:
# Visual question-answering prompt: asks for the subject's gender, age group,
# race and current activity in a single question.
prompt = (
    "Question: What is the gender, age (young, adult, senior), and race of the person in this photo, "
    "and what are they doing? Answer:"
)

# Caption every image in the default image directory with this prompt.
captions = caption_images(prompt)

pexels-akshar-dave-977971.png: male, young, indian, playing guitar, smiling, sitting
pexels-andrea-piacquadio-3768176.png: woman, adult, white, sitting at a desk, writing
pexels-andrea-piacquadio-3769999.png: woman, adult, white, preparing food, preparing food, cooking, cooking, cooking,
pexels-andrea-piacquadio-3782829.png: senior, white, male, holding a book, walking
pexels-andrea-piacquadio-3786748.png: female, adult, white, reading a book with a beagle
pexels-andrea-piacquadio-3814539.png: senior, white, male, sewing, working on a sewing machine
pexels-andrea-piacquadio-3967832.png: female, young, adult, and smiling while leaning on the edge of a pool
pexels-anna-keibalo-18018780.png: young, white, male, listening to music, headphones, outdoors
pexels-anna-shvets-5711880.png: male, adult, white, working on wood, making furniture
pexels-arun-thomas-2093578.png: female, young, asian, taking a photo with a camera
pexels-baihaki-hine-1883944.png: male, young, adult, and indonesian, and

In [52]:
# Save the generated captions.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (otherwise the captions from the
# expensive generation step above would be lost to an OSError).
os.makedirs('../data', exist_ok=True)

# One row per caption, in the order they were generated.
# NOTE(review): filenames are not stored alongside the captions; rows can only
# be matched back to images by position — confirm downstream code relies on that.
df = pd.DataFrame(captions, columns=['Caption'])
df.to_csv('../data/images.csv')