In [1]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image
import json
import torch
import numpy as np
from num2words import num2words
from tqdm.notebook import tqdm
from ofa.ofa_infer import OFAInference
from evaluate_metrics import compute_f1
from lavis.models import load_model_and_preprocess

## ViLT

In [6]:
WHICHprocessor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def infer_vilt(image_path, question):
    image = Image.open(image_path)
    # prepare inputs
    encoding = processor(image, question, return_tensors="pt")

    # forward pass
    outputs = model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    
    answer = model.config.id2label[idx]
    
    split_answer = answer.split()
    answ = []
    for ans in split_answer:
        try:
            answ.append(num2words(ans).replace('-', ' '))
        except:
            answ.append(ans)
    
    return ' '.join(answ)

## OFA 

In [2]:
# ofa = OFAInference()

ofa = OFAInference(pretrained_path='models/vqa_huge_best.pt')

def infer_ofa(image_path, question):
    answer = ofa.ofa_inference(image_path, question)
    split_ans = answer.split()
    ans = []
    for w in split_ans:
        try:
            ans.append(num2words(w))
        except:
            ans.append(w)
    return ' '.join(ans)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  "Argument interpolation should be of type InterpolationMode instead of int. "


## LAVIS

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_vqa", model_type="aokvqa", is_eval=True, device=device)

def infer_lavis(image_path, question):
    raw_image = Image.open(image_path).convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    question = txt_processors["eval"](question)
    answer = model.predict_answers(
        samples={"image": image, "text_input": question}, inference_method="generate")
    
    split_ans = answer[0].split()
    ans = []
    for w in split_ans:
        try:
            ans.append(num2words(w))
        except:
            ans.append(w)
    return ' '.join(ans)

In [3]:
with open('data/test/evjvqa_public_test-lang-qtype-answer.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)
    
annotations = test_data['annotations']

len(annotations)

5015

In [5]:
gold_answers = []
ofa_answers = []
vilt_answers = []
lavis_answers = []


for anno in tqdm(annotations):
    if anno['question_type'] in ['WHAT_COLOR'] and anno['language'] == 'en':
#     if anno['language'] == 'en':
#         vilt_answers.append(infer_vilt(anno['img_path'], anno['question']))
#         ofa_answers.append(infer_ofa(anno['img_path'], anno['question']))
        lavis_answers.append(infer_lavis(anno['img_path'], anno['question']))
        gold_answers.append(anno['answer'])

  0%|          | 0/5015 [00:00<?, ?it/s]

In [8]:
i = 0
gold_dict = {}
ofa_dict = {}
vilt_dict = {}
lavis_dict = {}

for j, anno in tqdm(enumerate(annotations)):
    if anno['question_type'] in ['WHAT_COLOR'] and anno['language'] == 'en':
#     if anno['language'] == 'en':
        idx = annotations[j]['id']
        gold_dict[idx] = gold_answers[i]
#         ofa_dict[idx] = ofa_answers[i]
#         vilt_dict[idx] = vilt_answers[i]
        lavis_dict[idx] = lavis_answers[i]
        
        i += 1
        
i

0it [00:00, ?it/s]

212

In [10]:
with open('./outputs/results-lavis-best-en.json', 'w', encoding='utf-8') as f:
    json.dump(lavis_dict, f, indent=4, ensure_ascii=False)
    
# with open('./outputs/results-lavis-best-en.json', 'r', encoding='utf-8') as f:
#     ofa_dict = json.load(f)

In [14]:
tmp_lavis_dict = {}
for k, v in lavis_dict.items():
    tmp_ofa_dict[int(k)] = v
    
lavis_dict = tmp_lavis_dict

In [16]:
ofa_score = 0
vilt_score = 0
lavis_score = 0

for i in range(len(gold_answers)):
    
#     if ofa_answers[i] in gold_answers[i]:
#         ofa_score += 1
    
#     if vilt_answers[i] in gold_answers[i]:
#         vilt_score += 1
        
    if lavis_answers[i] in gold_answers[i]:
        lavis_score += 1
        
# (ofa_score, ofa_base_score, vilt_score), (ofa_score / i, ofa_base_score/i, vilt_score/i)

print(
#     f"vilt_score: {vilt_score}/{len(gold_answers)}~={vilt_score/len(gold_answers) * 100}\n"
#     f"ofa_score: {ofa_score}/{len(gold_answers)}~={ofa_score/ len(gold_answers) * 100}\n"
    f"lavis_score {lavis_score}/{len(gold_answers)}~={lavis_score/len(gold_answers) * 100}"
)

print(f"OFA: {compute_f1(a_gold=gold_dict, a_pred=ofa_dict)}")

# print(f"ViLT: {compute_f1(a_gold=gold_dict, a_pred=vilt_dict)}")

# print(f"BLIP: {compute_f1(a_gold=gold_dict, a_pred=lavis_dict)}")

lavis_score 152/212~=71.69811320754717
OFA: 0.47023037235301385


In [None]:
OFA: 0.26887240779732646
