In [1]:
!git clone https://github.com/miohana/vqa-llm-framework.git

Cloning into 'vqa-llm-framework'...
remote: Enumerating objects: 100, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 100 (delta 30), reused 84 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (100/100), 2.02 MiB | 31.84 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [2]:
!pip install langchain openai langchain_openai datasets evaluate transformers -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/389.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from google.colab import userdata

In [4]:
import os
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [5]:
import pandas as pd
from datasets import load_dataset

class DatasetLoader:
    def __init__(self, eval_questions_path, vqa_eval_path):
        self.eval_questions_path = eval_questions_path
        self.vqa_eval_path = vqa_eval_path

    def load_data(self):
        self.coco_eval_questions = pd.read_json(self.eval_questions_path).head(5)
        self.vqa_eval = pd.read_json(self.vqa_eval_path).head(5)

        return self.coco_eval_questions, self.vqa_eval

In [6]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
import torch

class ResponseGenerator:
    def __init__(self, model_id="google/paligemma-3b-mix-224"):
        self.model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
        self.processor = AutoProcessor.from_pretrained(model_id)

    def generate_response(self, question: str, image_url: str):
        image = Image.open(requests.get(image_url, stream=True).raw)
        model_inputs = self.processor(text=question, images=image, return_tensors="pt")
        input_len = model_inputs["input_ids"].shape[-1]

        with torch.inference_mode():
            generation = self.model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
            generation = generation[0][input_len:]
            response = self.processor.decode(generation, skip_special_tokens=True)
        return response

In [7]:
from evaluate import load
from langchain_openai import OpenAIEmbeddings
from scipy.spatial.distance import cosine
import numpy as np

class MetricEvaluator:
    def __init__(self):
        self.squad_metric = load("squad")
        self.embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

    def compute_standard_metrics(self, references, predictions):
        results = self.squad_metric.compute(predictions=predictions, references=references)
        f1_score = results["f1"]
        exact_match = results["exact_match"]
        return {"f1_score": f1_score, "exact_match": exact_match}

    def compute_semantic_metrics(self, references, predictions):
        similarities = []
        for ref, pred in zip(references, predictions):
            ref_embedding = self.embeddings_model.embed_query(ref["answers"][0]["text"])
            pred_embedding = self.embeddings_model.embed_query(pred["prediction_text"])
            similarity = 1 - cosine(ref_embedding, pred_embedding)
            similarities.append(similarity)
        avg_similarity = np.mean(similarities)
        return {"semantic_similarity": avg_similarity}

    def evaluate(self, references, predictions):
        standard_metrics = self.compute_standard_metrics(references, predictions)
        #semantic_metrics = self.compute_semantic_metrics(references, predictions)
        #return {"standard_metrics": standard_metrics, "semantic_metrics": semantic_metrics}
        return {"standard_metrics": standard_metrics}


In [8]:
import json
import pandas as pd

class ResultsReporter:
    def __init__(self, results_data, output_file="vqa_results.json"):
        self.results_data = results_data
        self.output_file = output_file

    def save_as_json(self):
        with open(self.output_file, "w") as json_file:
            json.dump(self.results_data, json_file, indent=4)
        print(f"Results saved as JSON in {self.output_file}")

    def save_as_dataframe(self):
        df = pd.DataFrame(self.results_data)
        return df

    def display_results(self):
        df = self.save_as_dataframe()
        print("Results Summary:")
        print(df.head())

In [9]:
def main(eval_questions_path, vqa_eval_path):
    loader = DatasetLoader(eval_questions_path, vqa_eval_path)
    coco_eval_questions, vqa_eval = loader.load_data()

    response_generator = ResponseGenerator(model_id="google/paligemma-3b-mix-224")

    predictions = []
    references = []
    results_data = []

    for index, row in vqa_eval.iterrows():
        question = row["question"]
        reference_answer = row["multiple_choice_answer"]
        image_url = row["url"]

        response = response_generator.generate_response(question, image_url)

        predictions.append({"id": str(row["id"]), "prediction_text": response})
        references.append({
            "id": str(row["id"]),
            "answers": [{"text": reference_answer, "answer_start": 0}]
        })

    evaluator = MetricEvaluator()
    metrics = evaluator.evaluate(references, predictions)

    for idx, pred in enumerate(predictions):
        result_row = {
            "id": pred["id"],
            "model_answer": pred["prediction_text"],
            "model_name": "pali_gemma",
            "f1": metrics["standard_metrics"]["f1_score"],
            "accuracy": metrics["standard_metrics"]["exact_match"]
            #"metric_a": metrics["semantic_metrics"]["semantic_similarity"],
            #"metric_b": metrics["semantic_metrics"]["semantic_similarity"]
        }
        results_data.append(result_row)

    reporter = ResultsReporter(results_data)
    reporter.save_as_json()
    reporter.display_results()

In [10]:
eval_questions_path = "/content/vqa-llm-framework/data/coco-eval-questions.json"
vqa_eval_path = "/content/vqa-llm-framework/data/vqa-eval.json"


main(eval_questions_path, vqa_eval_path)

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Results saved as JSON in vqa_results.json
Results Summary:
       id model_answer  model_name    f1  accuracy
0  293832            6  pali_gemma  80.0      80.0
1  129592          yes  pali_gemma  80.0      80.0
2   13729        brown  pali_gemma  80.0      80.0
3  379086          yes  pali_gemma  80.0      80.0
4   96618          yes  pali_gemma  80.0      80.0


10/11/2024

In [None]:
# Set your Hugging Face token securely
import os
os.environ["HF_TOKEN"] = ""

In [10]:
import json
import os
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

class ResponseGenerator:
    def __init__(self, model_id="google/paligemma-3b-mix-224"):
        self.token = os.environ.get("HF_TOKEN")
        if not self.token:
            raise ValueError("Hugging Face token (HF_TOKEN) not set in environment variables.")

        try:
            print("Loading model and processor...")
            self.model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, use_auth_token=self.token).eval()
            self.processor = AutoProcessor.from_pretrained(model_id, use_auth_token=self.token)
            print("Model and processor loaded successfully.")
        except Exception as e:
            raise ValueError(f"Error loading model or processor: {e}")

    def fetch_image(self, image_url: str) -> Image.Image:
        try:
            response = requests.get(image_url, stream=True, timeout=10)
            response.raise_for_status()
            return Image.open(response.raw)
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Error fetching image: {e}")

    def generate_response(self, question: str, image_url: str) -> str:
        try:
            image = self.fetch_image(image_url)
            model_inputs = self.processor(text=question, images=image, return_tensors="pt")
            input_len = model_inputs["input_ids"].shape[-1]

            with torch.inference_mode():
                generation = self.model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
                generation = generation[0][input_len:]
                response = self.processor.decode(generation, skip_special_tokens=True)
            return response
        except Exception as e:
            print(f"Error generating response: {e}")
            return "An error occurred while generating the response."

    def generate_batch_responses(self, data: list) -> list:
        responses = []
        for entry in data:
            question = entry["question"]
            image_url = entry["url"]
            response = self.generate_response(question, image_url)
            responses.append({
                "id": entry["id"],
                "question": question,
                "url": image_url,
                "model_answer": response,
                "model_name": "paligemma"
            })
        return responses


def main(dataset_type: str):
    if dataset_type == "vqa":
        input_path = "/content/vqa-llm-framework/0_source_datasets/vqa_questions.json"
        output_path = "/content/vqa-llm-framework/2_answered_question_datasets/vqa_answered.json"
    elif dataset_type == "coco":
        input_path = "/content/vqa-llm-framework/0_source_datasets/coco_questions.json"
        output_path = "/content/vqa-llm-framework/2_answered_question_datasets/coco_answered.json"
    else:
        raise ValueError("Invalid dataset type. Use 'vqa' or 'coco'.")

    with open(input_path, 'r') as file:
        input_data = json.load(file)

    generator = ResponseGenerator()

    responses = generator.generate_batch_responses(input_data)

    with open(output_path, 'w') as outfile:
        json.dump(responses, outfile, indent=4)
    print(f"Responses saved to {output_path}")

In [11]:
# Run the main function with the desired dataset type
dataset_type = "coco"  # Change to "coco" if needed
main(dataset_type)

Loading model and processor...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model and processor loaded successfully.
Error generating response: Unsupported number of image dimensions: 2
Responses saved to /content/vqa-llm-framework/2_answered_question_datasets/coco_answered.json


In [8]:
import json

def add_model_name_column(input_path: str, output_path: str):
    """
    Add a column 'model_name' with value 'paligemma' to each entry in the dataset.

    Args:
        input_path (str): Path to the input JSON file.
        output_path (str): Path to save the updated JSON file.
    """
    # Load the existing dataset
    with open(input_path, 'r') as file:
        data = json.load(file)

    # Add the "model_name" column to each entry
    for entry in data:
        entry["model_name"] = "paligemma"

    # Save the updated dataset
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile, indent=4)

    print(f"Updated dataset saved to {output_path}")

# Example usage
input_path = "/content/vqa-llm-framework/2_answered_question_datasets/vqa_answered.json"
output_path = "/content/vqa-llm-framework/2_answered_question_datasets/vqa_answered_with_model_name.json"

add_model_name_column(input_path, output_path)


Updated dataset saved to /content/vqa-llm-framework/2_answered_question_datasets/vqa_answered_with_model_name.json


In [None]:
import json
import os
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

class ResponseEvaluator:
    def __init__(self, model_id="google/paligemma-3b-mix-224"):
        self.token = os.environ.get("HF_TOKEN")
        if not self.token:
            raise ValueError("Hugging Face token (HF_TOKEN) not set in environment variables.")

        print("Loading PaliGemma model and processor...")
        self.model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, use_auth_token=self.token
        ).eval()
        self.processor = AutoProcessor.from_pretrained(
            model_id, use_auth_token=self.token
        )
        print("Model loaded successfully.")

    def fetch_image(self, image_url: str) -> Image.Image:
        """Fetch image from the provided URL."""
        try:
            response = requests.get(image_url, stream=True, timeout=10)
            response.raise_for_status()
            return Image.open(response.raw)
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Error fetching image: {e}")

    def generate_evaluation(self, question: str, image_url: str, model_answer: str, evaluation_type: str) -> int:
        """
        Generate an evaluation score for faithfulness or relevancy.
        The model should respond with '1' (supported) or '0' (not supported).
        """
        try:
            image = self.fetch_image(image_url)
            prompt = (
                f"Evaluate the {evaluation_type} of the given answer based on the image and question. "
                f"Question: {question}. Model Answer: {model_answer}. "
                "Respond with 1 if the answer is supported, otherwise respond with 0."
            )

            model_inputs = self.processor(text=prompt, images=image, return_tensors="pt")
            input_len = model_inputs["input_ids"].shape[-1]

            with torch.inference_mode():
                generation = self.model.generate(**model_inputs, max_new_tokens=10, do_sample=False)
                generation = generation[0][input_len:]
                response = self.processor.decode(generation, skip_special_tokens=True).strip()

            # Ensure response is either '0' or '1'
            return int(response) if response in ["0", "1"] else 0
        except Exception as e:
            print(f"Error during evaluation: {e}")
            return 0

    def assess_batch(self, data: list) -> list:
        """Evaluate a batch of questions and answers."""
        results = []
        for entry in data:
            question = entry["question"]
            image_url = entry["url"]
            model_answer = entry["model_answer"]

            # Evaluate faithfulness
            faithfulness = self.generate_evaluation(
                question, image_url, model_answer, "faithfulness"
            )

            # Evaluate relevancy
            relevancy = self.generate_evaluation(
                question, image_url, model_answer, "relevancy"
            )

            results.append({
                "id": entry["id"],
                "model_name": entry.get("model_name", "unknown"),
                "question": question,
                "url": image_url,
                "model_answer": model_answer,
                "faithfulness": faithfulness,
                "relevancy": relevancy
            })
        return results


def main(dataset_type: str):
    # Define paths based on the dataset type
    if dataset_type == "vqa":
        input_path = "/content/vqa-llm-framework/2_answered_question_datasets/vqa_answered.json"
        output_path = "/content/vqa-llm-framework/3_assessment_results_datasets/vqa_assessment_results.json"
    elif dataset_type == "coco":
        input_path = "/content/vqa-llm-framework/2_answered_question_datasets/coco_answered.json"
        output_path = "/content/vqa-llm-framework/3_assessment_results_datasets/coco_assessment_results.json"
    else:
        raise ValueError("Invalid dataset type. Use 'vqa' or 'coco'.")

    # Load input data
    with open(input_path, 'r') as file:
        input_data = json.load(file)

    # Instantiate the evaluator and assess the dataset
    evaluator = ResponseEvaluator()
    assessment_results = evaluator.assess_batch(input_data)

    # Save results to a new JSON file
    with open(output_path, 'w') as outfile:
        json.dump(assessment_results, outfile, indent=4)
    print(f"Assessment results saved to {output_path}")


# Google Colab - Prompt user to select dataset type
dataset_type = input("Enter dataset type (vqa or coco): ").strip().lower()

# Run the main function
main(dataset_type)

Enter dataset type (vqa or coco): coco
Loading PaliGemma model and processor...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded successfully.
