In [None]:
import ast
import json
import os

import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm
from transformers import ViltProcessor, ViltForQuestionAnswering

from compute_score import compute_score

# Choose the compute device. A GPU is preferred when available: per the original
# author's note, a CPU-only run takes a bit more than an hour.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Single project root for every path below. The default is the original checkout
# location; set the OPENVIVQA_ROOT environment variable to run from another machine.
project_root = os.environ.get("OPENVIVQA_ROOT", "/home/tgng/coding/OpenViVQA")

data_path = os.path.join(project_root, "data/text/evaluate_60k_data_balanced_preprocessed.csv")
image_dir = os.path.join(project_root, "data/images/")
data = pd.read_csv(data_path)

# Load the processor and the fine-tuned model from the same directory
model_dir = os.path.join(project_root, "finetuned_vilt_best/")
processor = ViltProcessor.from_pretrained(model_dir)
model = ViltForQuestionAnswering.from_pretrained(model_dir).to(device)  # move the model to the GPU when one is available

# Accumulators filled by the evaluation loop
predictions = []
ground_truths = []
# Maximum input sequence length. Per the original author's note this is mainly for
# speed, and the upstream repo code also caps the number of tokens for long inputs.
max_length = 40

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Pick one row of the evaluation table to inspect visually
sample_idx = 0
row = data.iloc[sample_idx]

# Image files are named after the zero-padded (12 digits) image id
image_name = f"{int(row['image_id']):012d}.jpg"
image_path = os.path.join(image_dir, image_name)

# Load image & text. convert() returns a new image, so the file handle can be
# closed as soon as the context manager exits.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")
question = row["question"]
# The "answers" cell is a list literal stored as text -> ["answer1", "answer2", ...].
# literal_eval parses it without executing arbitrary code (the original used eval()).
answers = ast.literal_eval(row["answers"])

# Model input. Truncate to max_length here as well so the preview matches what the
# evaluation loop feeds the model.
inputs = processor(
    images=image,
    text=question,
    return_tensors="pt",
    truncation=True,
    max_length=max_length,
).to(device)
# Inference only: skip autograd bookkeeping
with torch.no_grad():
    outputs = model(**inputs)
# NOTE(review): processor.decode is documented as a tokenizer decode, so this treats
# the argmax class index as a vocabulary id. If the fine-tuned head uses an answer-label
# space, model.config.id2label[idx] is the right lookup -- confirm against the training code.
predicted_answer = processor.decode(outputs.logits.argmax(dim=-1))

plt.figure(figsize=(8, 8))
plt.imshow(image)
plt.axis("off")
plt.title(f"Question: {question}\nPredicted Answer: {predicted_answer}\nGround Truth: {', '.join(answers)}")
plt.show()

In [None]:
# Start from empty accumulators so that re-running this cell replaces the previous
# results instead of appending a second copy of every sample.
predictions = []
ground_truths = []

# Inference only: no_grad avoids recording autograd state for every forward pass.
with torch.no_grad():
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Processing", unit="sample"):
        image_name = f"{int(row['image_id']):012d}.jpg"
        image_path = os.path.join(image_dir, image_name)

        # convert() returns a new image, so the file handle can be closed right away
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        question = row["question"]
        # Parse the stored list literal without executing code (the original used eval())
        answers = ast.literal_eval(row["answers"])

        # The question must be capped at max_length tokens. Truncating inside the
        # processor call avoids the tokenize -> decode -> re-tokenize round trip, which
        # is lossy and can produce more than max_length tokens on the second pass.
        inputs = processor(
            images=image,
            text=question,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        ).to(device)  # move the inputs to the GPU when one is available
        outputs = model(**inputs)
        # NOTE(review): processor.decode is documented as a tokenizer decode, so this
        # treats the argmax class index as a vocabulary id. If the fine-tuned head uses
        # an answer-label space, model.config.id2label[idx] is the right lookup --
        # confirm against the training code.
        predicted_answer = processor.decode(outputs.logits.argmax(dim=-1))

        predictions.append(predicted_answer)
        ground_truths.append(answers)

In [None]:
import numpy as np

In [None]:
# Re-key both collections by the sample's position, written as a string.
# References keep their list of answers as-is.
gts = dict(zip(map(str, range(len(ground_truths))), ground_truths))
# Each predicted answer is wrapped in a one-element list.
gen = dict(zip(map(str, range(len(predictions))), ([answer] for answer in predictions)))

scores = compute_score(gts, gen)
print("Evaluation Scores:", scores)

def convert_to_serializable(obj):
    """Convert NumPy containers and scalars into plain Python values.

    Intended for use as the ``default`` hook of ``json.dump``.

    Args:
        obj: The value to convert.

    Returns:
        A (nested) list for ``np.ndarray``, the equivalent Python scalar for any
        NumPy scalar type, and ``obj`` itself unchanged for everything else.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # np.generic is the base class of every NumPy scalar, so this also covers the
    # types the explicit float32/float64/int32/int64 list missed (float16, int8,
    # int16, unsigned ints, bool_, ...).
    if isinstance(obj, np.generic):
        return obj.item()
    # Other data types are passed through unchanged
    return obj

# Results are written under the project root. The default is the original checkout
# location; set the OPENVIVQA_ROOT environment variable to relocate it.
output_path = os.path.join(
    os.environ.get("OPENVIVQA_ROOT", "/home/tgng/coding/OpenViVQA"),
    "evaluation_results.json",
)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be opened
# as UTF-8 explicitly rather than relying on the platform's default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(scores, f, ensure_ascii=False, indent=4, default=convert_to_serializable)