In [1]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image
import json
import torch
import numpy as np
from num2words import num2words
from tqdm import tqdm
from ofa.ofa_infer import OFAInference
from evaluate_metrics import compute_acc
from lavis.models import load_model_and_preprocess

## ViLT

In [2]:
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def infer_vilt(image_path, question):
    image = Image.open(image_path)
    # prepare inputs
    encoding = processor(image, question, return_tensors="pt")

    # forward pass
    outputs = model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    
    answer = model.config.id2label[idx]
    
    split_answer = answer.split()
    answ = []
    for ans in split_answer:
        try:
            answ.append(num2words(ans).replace('-', ' '))
        except:
            answ.append(ans)
    
    return ' '.join(answ)

## OFA 

In [2]:
ofa_base = OFAInference()

def infer_ofa_base(image_path, question):
    answer = ofa_base.ofa_inference(image_path, question)
    split_ans = answer.split()
    ans = []
    for w in split_ans:
        try:
            ans.append(num2words(w))
        except:
            ans.append(w)
    return ' '.join(ans)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  "Argument interpolation should be of type InterpolationMode instead of int. "


In [2]:
# ofa = OFAInference()

ofa = OFAInference(pretrained_path='models/vqa_large_best.pt')

def infer_ofa(image_path, question):
    answer = ofa.ofa_inference(image_path, question)
    split_ans = answer.split()
    ans = []
    for w in split_ans:
        try:
            ans.append(num2words(w))
        except:
            ans.append(w)
    return ' '.join(ans)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  "Argument interpolation should be of type InterpolationMode instead of int. "


## LAVIS

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_vqa", model_type="vqav2", is_eval=True, device=device)

def infer_lavis(image_path, question):
    raw_image = Image.open(image_path).convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    question = txt_processors["eval"](question)
    answer = model.predict_answers(
        samples={"image": image, "text_input": question}, inference_method="generate")
    
    split_ans = answer[0].split()
    ans = []
    for w in split_ans:
        try:
            ans.append(num2words(w))
        except:
            ans.append(w)
    return ' '.join(ans)

In [3]:
with open('data/train/evjvqa_train_lang_qtype-detailed.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
    
annotations = train_data['annotations']

len(annotations)

23785

In [4]:
gold_answers = []
ofa_base_answers = []
ofa_answers = []
vilt_answers = []
lavis_answers = []


for anno in tqdm(annotations):
    if anno['question_type'] in ['WHAT_DO'] and anno['language'] == 'en':
        vilt_answers.append(infer_vilt(anno['img_path'], anno['question']))
#         ofa_base_answers.append(infer_ofa_base(anno['img_path'], anno['question']))
#         ofa_answers.append(infer_ofa(anno['img_path'], anno['question']))
#         lavis_answers.append(infer_lavis(anno['img_path'], anno['question']))
        gold_answers.append(anno['answer'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23785/23785 [01:34<00:00, 250.94it/s]


In [5]:
gold_dict = {}
ofa_base_dict = {}
ofa_dict = {}
vilt_dict = {}
lavis_dict = {}

In [6]:
i = 0

for anno in tqdm(annotations):
    if anno['question_type'] in ['WHAT_DO'] and anno['language'] == 'en':
        idx = annotations[i]['id']
        gold_dict[idx] = gold_answers[i]
#         ofa_base_dict[idx] = ofa_base_answers[i]
#         ofa_dict[idx] = ofa_answers[i]
        vilt_dict[idx] = vilt_answers[i]
#         lavis_dict[idx] = lavis_answers[i]
        
        i += 1
        
i

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23785/23785 [00:00<00:00, 4574118.32it/s]


1030

In [10]:
ofa_base_score = 0
ofa_score = 0
vilt_score = 0
lavis_score = 0

for i in range(len(gold_answers)):
#     try:
#     if ofa_answers[i] in gold_answers[i]:
#         ofa_score += 1
    
#     if ofa_base_answers[i] in gold_answers[i]:
#         ofa_base_score += 1
    
    if vilt_answers[i] in gold_answers[i]:
        vilt_score += 1
        
#     if lavis_answers[i] in gold_answers[i]:
#         lavis_score += 1
        
# (ofa_score, ofa_base_score, vilt_score), (ofa_score / i, ofa_base_score/i, vilt_score/i)

print(
    f"vilt_score: {vilt_score}/{i}~={vilt_score/i}\n"
#     f"ofa_score: {ofa_score}/{i}~={ofa_score/i}\n"
#     f"ofa_base_score: {ofa_base_score}/{i}~={ofa_base_score/i}\n"
    f"lavis_score {lavis_score}/{i}~={lavis_score/i}"
)

vilt_score: 127/1029~=0.12342079689018465
lavis_score 0/1029~=0.0


In [63]:
compute_acc(a_gold=gold_dict, a_pred=ofa_dict)

In [36]:
compute_acc(a_gold=gold_dict, a_pred=ofa_base_dict)

0.19624572550596328

In [84]:
compute_acc(a_gold=gold_dict, a_pred=vilt_dict)

0.04639319261219669

In [90]:
compute_acc(a_gold=gold_dict, a_pred=lavis_dict)

0.1791443713339354