## **Evaluating BLIP without fine-tuning on VQA-RAD**

In [None]:
!pip install datasets transformers torch torchvision tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import pandas as pd
import csv
import os
import torch
from tqdm import tqdm
from datasets import load_dataset
import io

# Load the pretrained BLIP VQA checkpoint and its matching processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Download (or reuse the cached copy of) the dataset
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Keep only the test questions whose reference answer is "yes" or "no"
# (case-insensitive) and renumber the surviving rows from 0.
test_df = dataset['test'].to_pandas()
is_yes_no = test_df['answer'].str.lower().isin(['yes', 'no'])
test_df = test_df[is_yes_no].reset_index(drop=True)

# One (row id, reference answer, model answer) tuple per evaluated question
results = []

# Walk the needed columns in lockstep instead of materialising a Series per row
samples = zip(test_df.index, test_df['question'], test_df['image'], test_df['answer'])
for sample_id, prompt, image_field, reference in tqdm(
    samples, total=test_df.shape[0], desc="Processing"
):
    # The image column stores the encoded file under the 'bytes' key
    picture = Image.open(io.BytesIO(image_field['bytes'])).convert("RGB")

    # Build the model inputs and move them to the same device as the model
    model_inputs = processor(images=picture, text=prompt, return_tensors="pt").to(device)

    # Generate the answer tokens and turn the first sequence back into text
    token_ids = model.generate(**model_inputs)
    prediction = processor.decode(token_ids[0], skip_special_tokens=True)

    results.append((sample_id, reference, prediction))

# Persist the header plus all collected rows, creating the output folder if needed
csv_file_path = "results/results_no_finetuning.csv"
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, mode="w", newline="") as out_file:
    csv.writer(out_file).writerows(
        [("ID", "Actual Answer", "Generated Answer"), *results]
    )

print(f"Results saved to {csv_file_path}")

README.md:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

(…)-00000-of-00001-eb8844602202be60.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

(…)-00000-of-00001-e5bc3d208bb4deeb.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1793 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

Processing: 100%|██████████| 251/251 [00:28<00:00,  8.92it/s]

Results saved to results/results_no_finetuning.csv





In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def calculate_metrics(csv_file_path):
    """Compute yes/no classification metrics from a saved results CSV.

    The CSV must contain the columns "Actual Answer" and "Generated Answer".
    Answers are compared after trimming surrounding whitespace and
    lower-casing. Rows where either answer is not exactly "yes" or "no"
    (including missing values) are excluded, so the returned metrics describe
    only the questions for which the model produced a yes/no answer.

    Args:
        csv_file_path: Path to the results CSV file.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        treat "yes" as the positive class.

    Raises:
        ValueError: If no row has a yes/no value in both columns.
    """
    df = pd.read_csv(csv_file_path)

    def _normalize(column):
        # astype(str) keeps the .str accessor usable even when pandas parsed
        # the column as non-text (e.g. every cell empty); strip() stops
        # answers such as "Yes " from being silently discarded.
        return df[column].astype(str).str.strip().str.lower()

    actual_answers = _normalize("Actual Answer")
    generated_answers = _normalize("Generated Answer")

    # Keep only the rows where both the reference and the prediction are binary
    valid_answers = ["yes", "no"]
    both_valid = actual_answers.isin(valid_answers) & generated_answers.isin(valid_answers)
    actual_answers = actual_answers[both_valid]
    generated_answers = generated_answers[both_valid]

    if actual_answers.empty:
        raise ValueError(
            f"No rows in {csv_file_path!r} have a yes/no value in both "
            "'Actual Answer' and 'Generated Answer'"
        )

    # zero_division=0 yields the same 0.0 as the default, but without a
    # warning, when "yes" is never predicted or never present.
    accuracy = accuracy_score(actual_answers, generated_answers)
    precision = precision_score(
        actual_answers, generated_answers, pos_label="yes", average="binary", zero_division=0
    )
    recall = recall_score(
        actual_answers, generated_answers, pos_label="yes", average="binary", zero_division=0
    )
    f1 = f1_score(
        actual_answers, generated_answers, pos_label="yes", average="binary", zero_division=0
    )

    return accuracy, precision, recall, f1

# Score the predictions stored in the zero-shot results file
csv_file_path = "results/results_no_finetuning.csv"
accuracy, precision, recall, f1 = calculate_metrics(csv_file_path)

# Report every metric on its own line, rounded to four decimal places
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.5162
Precision: 0.5344
Recall: 0.8136
F1 Score: 0.6451
