In [1]:
!pip install bert-score --no-deps --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.metrics.distance import edit_distance
from nltk.translate.meteor_score import meteor_score
from nltk.corpus import wordnet as wn
from bert_score import score as bert_score

# helper for WordNet similarity
def best_wn_similarity(w1, w2):
    syns1 = wn.synsets(w1)
    syns2 = wn.synsets(w2)
    maxsim = 0.0
    for s1 in syns1:
        for s2 in syns2:
            sim = s1.wup_similarity(s2) or 0.0
            if sim > maxsim:
                maxsim = sim
    return maxsim

file_paths = [
    '/kaggle/input/rovqa-paligemma-eval/PaliGemma-Finetuned.csv',
    '/kaggle/input/rovqa-blip-eval/Blip-Finetuned.csv'
]

for path in file_paths:
    df = pd.read_csv(path).dropna(subset=['ground_truth','prediction'])
    GT = df['ground_truth'].str.lower().tolist()
    Y  = df['prediction'].str.lower().tolist()

    # Exact‐Match Accuracy
    acc = accuracy_score(GT, Y)

    # Macro F1
    prec, rec, f1_macro, _ = precision_recall_fscore_support(
        GT, Y, average='macro', zero_division=0
    )
    # WordNet Wu‐Palmer
    wn_sims = [best_wn_similarity(y, g) for y, g in zip(Y, GT)]
    avg_wn = np.mean(wn_sims)

    # BERTScore
    P, R, F1 = bert_score(
        cands=Y, refs=GT, lang="en", rescale_with_baseline=True
    )

    model_name = path.split('/')[-1]
    print(f"\n=== {model_name} ===")
    print(f"Exact‐Match Accuracy    : {acc:.3f}")
    print(f"Macro F1    : {f1_macro:.3f}")
    print(f"Wu–Palmer Sim      : {avg_wn:.3f}")
    print(f"BERT F1    : {F1.mean():.3f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-05-12 09:19:04.865314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747041545.060024      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747041545.125267      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== PaliGemma-Finetuned.csv ===
Exact‐Match Accuracy    : 0.741
Macro F1    : 0.469
Wu–Palmer Sim      : 0.905
BERT F1    : 0.943


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Blip-Finetuned.csv ===
Exact‐Match Accuracy    : 0.456
Macro F1    : 0.126
Wu–Palmer Sim      : 0.786
BERT F1    : 0.879
