In [None]:
import os
import csv
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering

# --- Paths (relative to the working directory the script/notebook runs in) ---
VQA_CSV     = "../data/vqa.csv"           # input table of VQA rows
CURATED_DIR = "../data/curated_images"    # directory that image filenames are joined onto
PRED_CSV    = "../data/predictions.csv"   # output table of model predictions

# --- Run configuration ---
SEED        = 7       # random_state for the row sample, so reruns pick the same rows
SAMPLE_SIZE = 10000   # maximum number of rows to evaluate
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"

# Draw a reproducible random subset of the table. The requested size is
# clamped to the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False (the default).
_vqa_full = pd.read_csv(VQA_CSV)
df_sample = (
    _vqa_full
    .sample(n=min(SAMPLE_SIZE, len(_vqa_full)), random_state=SEED)
    .reset_index(drop=True)
)

# Load the pretrained BLIP VQA processor and model; the model is moved to
# DEVICE and switched to eval mode for inference.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(DEVICE).eval()

# Run the VQA model over every sampled row and stream the results to PRED_CSV.
#
# Output columns: filename, question, answer (copied from the input row) and
# prediction (the decoded model output, stripped and lower-cased). A row whose
# image cannot be loaded or processed is still written, with an empty
# prediction, so the output has exactly one line per sampled row.
failed_rows = 0

with open(PRED_CSV, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "question", "answer", "prediction"])

    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="VQA Inference"):
        filename, question, answer = row["filename"], row["question"], row["answer"]
        try:
            img_path = os.path.join(CURATED_DIR, filename)
            # Context manager releases the underlying file handle right away;
            # convert() returns an independent in-memory copy.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert("RGB")
            inputs = processor(images=img, text=question, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                out_ids = model.generate(**inputs, max_new_tokens=5)
            prediction = processor.decode(out_ids[0], skip_special_tokens=True).strip().lower()
        except Exception as exc:
            # Best-effort: keep going on a bad row, but do not hide the reason.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the run can still be interrupted.
            failed_rows += 1
            tqdm.write(f"[skipped] {filename!r}: {type(exc).__name__}: {exc}")
            prediction = ""
        writer.writerow([filename, question, answer, prediction])

if failed_rows:
    print(f"{failed_rows} of {len(df_sample)} rows failed and were written with an empty prediction.")