In [None]:
import difflib
import os
from collections import Counter

import pandas as pd
import torch
from evaluate import load as load_metric
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering

In [None]:
# 1) Configuration
# Input CSV; the notebook reads its "filename", "question" and "answer" columns.
VQA_CSV       = "../data/vqa.csv"
# Directory that each row's "filename" is joined onto when opening the image.
CURATED_DIR   = "../data/curated_images"
# Output CSV: the sampled rows plus a "prediction" column.
PRED_CSV      = "../data/predictions.csv"
# Number of rows sampled from VQA_CSV for evaluation.
SAMPLE_SIZE   = 10000
# Every image is resized to this size with PIL before it reaches the processor.
IMAGE_SIZE    = (256, 256)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"
# random_state for the row sampling, so the evaluated subset is reproducible.
SEED          = 7

In [None]:
# 2) Load and sample
df = pd.read_csv(VQA_CSV)

# DataFrame.sample raises ValueError when n exceeds the number of rows (with
# the default replace=False), so clamp the request to what the CSV contains.
n_rows_to_sample = min(SAMPLE_SIZE, len(df))
df_sample = df.sample(n=n_rows_to_sample, random_state=SEED).reset_index(drop=True)

# Only these three columns are consumed by the inference loop.
questions = df_sample[["filename", "question", "answer"]]

In [None]:
# 3) Load BLIP-VQA model & processor
# The processor and the model are loaded from the same checkpoint id.
blip_checkpoint = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForQuestionAnswering.from_pretrained(blip_checkpoint)
# Move the weights to DEVICE and switch to evaluation mode.
model.to(DEVICE).eval()

In [None]:
# 4) Prepare metrics
# Load the three `evaluate` metrics in a fixed order and bind each one to the
# name the scoring cell expects.
bertscore, rouge, meteor = (
    load_metric(metric_name) for metric_name in ("bertscore", "rouge", "meteor")
)

In [None]:
# 5) Inference loop
# predictions[i] and references[i] always describe the same sampled row. A row
# whose image cannot be loaded keeps its slot with an empty prediction so both
# lists stay aligned with df_sample.
predictions = []
references  = []
failed_images = []  # (filename, error repr) for every image that failed to load

for row in tqdm(questions.itertuples(index=False), total=len(questions), desc="VQA Inference"):
    filename, q, a = row.filename, row.question, row.answer

    # str() guards against non-string answers (pandas parses values such as
    # "NA"/"null" as NaN and numeric columns as numbers), and strip() mirrors
    # the normalisation applied to the prediction below so that exact match
    # compares like with like.
    ref = str(a).strip().lower()

    try:
        img_path = os.path.join(CURATED_DIR, filename)
        # The context manager closes the underlying file; convert()/resize()
        # return new in-memory images, so `img` stays valid afterwards.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB").resize(IMAGE_SIZE)
    except Exception as exc:
        # Best effort: skip unreadable images but remember them. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit still stop the run.
        failed_images.append((filename, repr(exc)))
        predictions.append("")
        references.append(ref)
        continue

    inputs = processor(images=img, text=q, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        out_ids = model.generate(**inputs, max_new_tokens=5)
    pred = processor.decode(out_ids[0], skip_special_tokens=True).strip().lower()

    predictions.append(pred)
    references.append(ref)

n = len(predictions)

# Surface skipped rows instead of letting them silently drag the metrics down.
if failed_images:
    print(f"{len(failed_images)} of {n} images could not be loaded; their predictions were left empty.")

In [None]:
def _precision_recall_f1(true_pos, false_pos, false_neg):
    """Return (precision, recall, F1) from raw counts; empty denominators give 0.0."""
    precision = true_pos / (true_pos + false_pos) if true_pos + false_pos > 0 else 0.0
    recall = true_pos / (true_pos + false_neg) if true_pos + false_neg > 0 else 0.0
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0
    return precision, recall, f1_score


# Exact match
exact_matches = [p == r for p, r in zip(predictions, references)]
exact_acc = sum(exact_matches) / n

# Substring match
# An empty string is a substring of every string, so an empty prediction (for
# example a row whose image failed to load) must not count as a hit: both sides
# have to be non-empty. Identical strings still count, which keeps every exact
# match a substring match as well.
substr_matches = [
    (p == r) or (bool(p) and bool(r) and ((p in r) or (r in p)))
    for p, r in zip(predictions, references)
]
substr_acc = sum(substr_matches) / n

# Exact F1
# Every miss is counted as both a false positive and a false negative, so
# precision, recall and F1 all equal the exact-match accuracy here.
tp = sum(exact_matches)
fp = n - tp
fn = n - tp
exact_prec, exact_rec, exact_f1 = _precision_recall_f1(tp, fp, fn)

# Substring F1 (same counting scheme, applied to the substring matches)
tp_s = sum(substr_matches)
fp_s = n - tp_s
fn_s = n - tp_s
substr_prec, substr_rec, substr_f1 = _precision_recall_f1(tp_s, fp_s, fn_s)

# Token-level macro F1
# Per-example precision/recall/F1 over whitespace tokens, averaged over all
# examples. An example with no tokens on either side scores 0.0.
token_precisions, token_recalls, token_f1s = [], [], []
for p, r in zip(predictions, references):
    ptoks, rtoks = p.split(), r.split()
    if not ptoks or not rtoks:
        token_precisions.append(0.0)
        token_recalls.append(0.0)
        token_f1s.append(0.0)
        continue
    # Multiset overlap: each token counts as many times as it appears on both
    # sides. A set intersection divided by the list lengths under-counts
    # repeated tokens (e.g. "the the" vs "the the" would score 0.5, not 1.0).
    inter = sum((Counter(ptoks) & Counter(rtoks)).values())
    prec = inter / len(ptoks)
    rec  = inter / len(rtoks)
    f1   = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
    token_precisions.append(prec)
    token_recalls.append(rec)
    token_f1s.append(f1)

macro_token_prec = sum(token_precisions) / n
macro_token_rec  = sum(token_recalls) / n
macro_token_f1   = sum(token_f1s) / n

# Character-level similarity (reported as "Levenshtein Similarity")
# NOTE: the score is difflib.SequenceMatcher.ratio(), a matching-blocks ratio
# in [0, 1]; it is not derived from a true Levenshtein edit distance.
lev_scores = []
for pred_text, ref_text in zip(predictions, references):
    matcher = difflib.SequenceMatcher(None, pred_text, ref_text)
    lev_scores.append(matcher.ratio())
avg_lev = sum(lev_scores) / n

# All three `evaluate` metrics score the same prediction/reference lists.
text_pairs = {"predictions": predictions, "references": references}

# BERTScore F1: mean of the per-example F1 values
bs = bertscore.compute(lang="en", **text_pairs)
bert_f1 = sum(bs["f1"]) / n

# ROUGE-1 and ROUGE-L
rg = rouge.compute(**text_pairs)
rouge1, rougeL = rg["rouge1"], rg["rougeL"]

# METEOR
meteor_result = meteor.compute(**text_pairs)
meteor_s = meteor_result["meteor"]

# BLEU-1 to BLEU-4 via NLTK
# corpus_bleu expects, per example, a list of tokenised references and one
# tokenised hypothesis; each example here has exactly one reference.
refs_for_bleu = [[r.split()] for r in references]
preds_for_bleu = [p.split() for p in predictions]
smooth = SmoothingFunction().method1

# Uniform weights over the first k n-gram orders. BLEU-3 uses exact thirds:
# rounded weights of 0.33 would sum to 0.99 and slightly bias the score.
bleu1 = corpus_bleu(refs_for_bleu, preds_for_bleu, weights=(1, 0, 0, 0), smoothing_function=smooth)
bleu2 = corpus_bleu(refs_for_bleu, preds_for_bleu, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
bleu3 = corpus_bleu(refs_for_bleu, preds_for_bleu, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth)
bleu4 = corpus_bleu(refs_for_bleu, preds_for_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)

In [None]:
# 8) Print all metrics
# (padded label, value) pairs in report order; every value is printed with
# four decimal places.
metric_report = [
    ("Exact Match Acc         ", exact_acc),
    ("Substring Match Acc     ", substr_acc),
    ("Exact F1                ", exact_f1),
    ("Substring F1            ", substr_f1),
    ("Token-level Macro F1    ", macro_token_f1),
    ("BERTScore F1            ", bert_f1),
    ("ROUGE-1 F1              ", rouge1),
    ("ROUGE-L F1              ", rougeL),
    ("BLEU-1                  ", bleu1),
    ("BLEU-2                  ", bleu2),
    ("BLEU-3                  ", bleu3),
    ("BLEU-4                  ", bleu4),
    ("Levenshtein Similarity  ", avg_lev),
    ("METEOR                  ", meteor_s),
]
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# 9) Save predictions
# Attach the model outputs to the sampled rows (the list is in the same row
# order as df_sample), then write the frame without the index column.
df_sample["prediction"] = predictions
df_sample.to_csv(path_or_buf=PRED_CSV, index=False)