In [1]:
!git clone https://github.com/miohana/vqa-llm-framework.git

fatal: destination path 'vqa-llm-framework' already exists and is not an empty directory.


In [None]:
!pip install langchain openai langchain_openai datasets evaluate transformers -q

In [18]:
from google.colab import userdata

In [19]:
import os
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [40]:
import pandas as pd
from datasets import load_dataset

class DatasetLoader:
    def __init__(self, eval_questions_path, vqa_eval_path):
        self.eval_questions_path = eval_questions_path
        self.vqa_eval_path = vqa_eval_path

    def load_data(self):
        self.coco_eval_questions = pd.read_json(self.eval_questions_path).head(5)
        self.vqa_eval = pd.read_json(self.vqa_eval_path).head(5)

        return self.coco_eval_questions, self.vqa_eval

In [38]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
import torch

class ResponseGenerator:
    def __init__(self, model_id="google/paligemma-3b-mix-224"):
        self.model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
        self.processor = AutoProcessor.from_pretrained(model_id)

    def generate_response(self, question: str, image_url: str):
        image = Image.open(requests.get(image_url, stream=True).raw)
        model_inputs = self.processor(text=question, images=image, return_tensors="pt")
        input_len = model_inputs["input_ids"].shape[-1]

        with torch.inference_mode():
            generation = self.model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
            generation = generation[0][input_len:]
            response = self.processor.decode(generation, skip_special_tokens=True)
        return response

In [53]:
from evaluate import load
from langchain_openai import OpenAIEmbeddings
from scipy.spatial.distance import cosine
import numpy as np

class MetricEvaluator:
    def __init__(self):
        self.squad_metric = load("squad")
        self.embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

    def compute_standard_metrics(self, references, predictions):
        results = self.squad_metric.compute(predictions=predictions, references=references)
        f1_score = results["f1"]
        exact_match = results["exact_match"]
        return {"f1_score": f1_score, "exact_match": exact_match}

    def compute_semantic_metrics(self, references, predictions):
        similarities = []
        for ref, pred in zip(references, predictions):
            ref_embedding = self.embeddings_model.embed_query(ref["answers"][0]["text"])
            pred_embedding = self.embeddings_model.embed_query(pred["prediction_text"])
            similarity = 1 - cosine(ref_embedding, pred_embedding)
            similarities.append(similarity)
        avg_similarity = np.mean(similarities)
        return {"semantic_similarity": avg_similarity}

    def evaluate(self, references, predictions):
        standard_metrics = self.compute_standard_metrics(references, predictions)
        #semantic_metrics = self.compute_semantic_metrics(references, predictions)
        #return {"standard_metrics": standard_metrics, "semantic_metrics": semantic_metrics}
        return {"standard_metrics": standard_metrics}


In [62]:
import json
import pandas as pd

class ResultsReporter:
    def __init__(self, results_data, output_file="vqa_results.json"):
        self.results_data = results_data
        self.output_file = output_file

    def save_as_json(self):
        with open(self.output_file, "w") as json_file:
            json.dump(self.results_data, json_file, indent=4)
        print(f"Results saved as JSON in {self.output_file}")

    def save_as_dataframe(self):
        df = pd.DataFrame(self.results_data)
        return df

    def display_results(self):
        df = self.save_as_dataframe()
        print("Results Summary:")
        print(df.head())

In [66]:
def main(eval_questions_path, vqa_eval_path):
    loader = DatasetLoader(eval_questions_path, vqa_eval_path)
    coco_eval_questions, vqa_eval = loader.load_data()

    response_generator = ResponseGenerator(model_id="google/paligemma-3b-mix-224")

    predictions = []
    references = []
    results_data = []

    for index, row in vqa_eval.iterrows():
        question = row["question"]
        reference_answer = row["multiple_choice_answer"]
        image_url = row["url"]

        response = response_generator.generate_response(question, image_url)

        predictions.append({"id": str(row["id"]), "prediction_text": response})
        references.append({
            "id": str(row["id"]),
            "answers": [{"text": reference_answer, "answer_start": 0}]
        })

    evaluator = MetricEvaluator()
    metrics = evaluator.evaluate(references, predictions)

    for idx, pred in enumerate(predictions):
        result_row = {
            "id": pred["id"],
            "model_answer": pred["prediction_text"],
            "model_name": "pali_gemma",
            "f1": metrics["standard_metrics"]["f1_score"],
            "accuracy": metrics["standard_metrics"]["exact_match"]
            #"metric_a": metrics["semantic_metrics"]["semantic_similarity"],
            #"metric_b": metrics["semantic_metrics"]["semantic_similarity"]
        }
        results_data.append(result_row)

    reporter = ResultsReporter(results_data)
    reporter.save_as_json()
    reporter.display_results()

In [67]:
eval_questions_path = "/content/vqa-llm-framework/data/coco-eval-questions.json"
vqa_eval_path = "/content/vqa-llm-framework/data/vqa-eval.json"


main(eval_questions_path, vqa_eval_path)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

6
yes
brown
yes
yes
[{'id': '293832', 'answers': [{'text': '5', 'answer_start': 0}]}, {'id': '129592', 'answers': [{'text': 'yes', 'answer_start': 0}]}, {'id': '13729', 'answers': [{'text': 'brown', 'answer_start': 0}]}, {'id': '379086', 'answers': [{'text': 'yes', 'answer_start': 0}]}, {'id': '96618', 'answers': [{'text': 'yes', 'answer_start': 0}]}]
[{'id': '293832', 'prediction_text': '6'}, {'id': '129592', 'prediction_text': 'yes'}, {'id': '13729', 'prediction_text': 'brown'}, {'id': '379086', 'prediction_text': 'yes'}, {'id': '96618', 'prediction_text': 'yes'}]
Results saved as JSON in vqa_results.json
Results Summary:
       id model_answer  model_name    f1  accuracy
0  293832            6  pali_gemma  80.0      80.0
1  129592          yes  pali_gemma  80.0      80.0
2   13729        brown  pali_gemma  80.0      80.0
3  379086          yes  pali_gemma  80.0      80.0
4   96618          yes  pali_gemma  80.0      80.0
