In [None]:
import Levenshtein
import pandas as pd
from statistics import mean
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score

# Path of the predictions CSV to score; change it to evaluate another model's
# predictions. Alternative (non-Kaggle) location of the same file:
#   '../data/csvs/preds_blip_ft.csv'
PRED_CSV = '../input/vrdata/data/csvs/preds_blip_ft.csv' # path used when running on Kaggle

def load_data(filepath):
    """Load a predictions CSV with every cell read verbatim as a string.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``DataFrame`` whose columns all hold strings; empty cells are ``''``.
    """
    # keep_default_na=False stops pandas from treating literal texts such as
    # "None", "NA", "null" or "nan" as missing values. Without it those
    # answers/predictions would be parsed as NaN and then blanked below.
    df = pd.read_csv(filepath, dtype=str, keep_default_na=False)
    # Safety net: should the parser still produce a missing value, downstream
    # string operations get '' instead of NaN.
    return df.fillna('')

def _is_substring_match(answer, prediction):
    """Return True if the strings are equal or one non-empty string contains the other."""
    if answer == prediction:
        return True
    # '' is a substring of every string, so an empty side must never count as
    # a containment match (it would score every blank prediction as correct).
    if not answer or not prediction:
        return False
    return prediction in answer or answer in prediction


def compute_metrics(df):
    """Score predictions against reference answers with five text metrics.

    Leading/trailing whitespace is stripped once and the same normalised
    strings are fed to every metric, so a pair that counts as an exact match
    also gets a Levenshtein similarity of 1.0.

    Args:
        df: DataFrame with string columns ``'answer'`` (reference) and
            ``'prediction'`` (model output).

    Returns:
        dict mapping ``'EXACT MATCH'``, ``'SUBSTRING MATCH'``, ``'BERT F1'``,
        ``'LEVENSHTEIN'`` and ``'METEOR'`` to the mean score over all rows.

    Raises:
        statistics.StatisticsError: if ``df`` has no rows (mean of no data).
    """
    answers = df['answer'].str.strip()
    predictions = df['prediction'].str.strip()
    pairs = list(zip(answers, predictions))

    # Exact match accuracy
    exact_acc = (answers == predictions).astype(float).mean()

    # Substring match accuracy (empty strings only match other empty strings)
    sub_acc = mean(float(_is_substring_match(a, p)) for a, p in pairs)

    # Levenshtein similarity
    lev_mean = mean(Levenshtein.ratio(a, p) for a, p in pairs)

    # Meteor score: reference tokens first, hypothesis tokens second
    meteor_mean = mean(single_meteor_score(a.split(), p.split()) for a, p in pairs)

    # BERTScore F1: candidates first, references second
    _, _, F = bert_score(
        predictions.tolist(),
        answers.tolist(),
        lang='en',
        rescale_with_baseline=True,
    )
    bert_f1 = F.mean().item()

    return {
        'EXACT MATCH': exact_acc,
        'SUBSTRING MATCH': sub_acc,
        'BERT F1': bert_f1,
        'LEVENSHTEIN': lev_mean,
        'METEOR': meteor_mean,
    }

# Load the predictions, score them, and print one "NAME: value" line per
# metric with four decimal places.
df = load_data(PRED_CSV)
metrics = compute_metrics(df)
for name in metrics:
    value = metrics[name]
    print(f"{name}: {value:.4f}")