# In [None]:
from collections import Counter
from statistics import mean

import Levenshtein
import pandas as pd
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer

# Location of the predictions CSV that the script section passes to load_data().
# It is a relative path, so it is resolved against the current working directory.
PRED_CSV = '../data/predictions.csv'

def load_data(filepath):
    """Load a predictions CSV with every column read as plain text.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame whose cells are all strings; empty cells are ``''``.
    """
    # keep_default_na=False stops pandas from interpreting literal cell text
    # such as "NA", "N/A" or "null" as missing values. Without it those cells
    # become NaN and are then blanked by fillna, silently erasing real
    # answers/predictions before they are scored.
    df = pd.read_csv(filepath, dtype=str, keep_default_na=False)
    # Defensive: guarantee no NaN survives (e.g. from short rows) so that
    # downstream .strip()/.split() calls always receive a string.
    return df.fillna('')

def _token_f1(answer_tokens, prediction_tokens):
    """Return the bag-of-words F1 between two whitespace-token lists."""
    # Multiset intersection: repeated tokens are counted the same way in the
    # numerator as in the len() denominators, so identical inputs score 1.0.
    overlap = sum((Counter(answer_tokens) & Counter(prediction_tokens)).values())
    if not overlap:
        return 0.0
    precision = overlap / len(prediction_tokens)
    recall = overlap / len(answer_tokens)
    return 2 * precision * recall / (precision + recall)


def _substring_match(answer, prediction):
    """Return True when one stripped string contains the other."""
    answer, prediction = answer.strip(), prediction.strip()
    if not answer or not prediction:
        # '' is a substring of every string, so an empty side only counts as
        # a match when the other side is empty as well.
        return answer == prediction
    return prediction in answer or answer in prediction


def compute_metrics(df):
    """Score predictions against reference answers with several text metrics.

    Args:
        df: DataFrame with string columns ``'answer'`` (reference) and
            ``'prediction'`` (system output), one pair per row.

    Returns:
        dict mapping a metric display name to its value averaged over all
        rows. ``'Exact F1'`` and ``'Substring F1'`` are reported as the same
        numbers as the corresponding accuracies.

    Raises:
        ValueError: if ``df`` has no rows, since no average can be computed.
    """
    if df.empty:
        raise ValueError("compute_metrics requires at least one row")

    answers = df['answer'].tolist()
    predictions = df['prediction'].tolist()

    exact_acc = (df['answer'].str.strip() == df['prediction'].str.strip()).astype(float).mean()

    sub_hits, token_f1s, levs, meteors = [], [], [], []
    bleu1s, bleu2s, bleu3s, bleu4s = [], [], [], []
    r1_f1s, rl_f1s = [], []

    smoothie = SmoothingFunction().method1
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    # Cumulative n-gram weights for BLEU-1..4; each tuple sums to exactly 1.
    bleu_specs = (
        (bleu1s, (1, 0, 0, 0)),
        (bleu2s, (0.5, 0.5, 0, 0)),
        (bleu3s, (1 / 3, 1 / 3, 1 / 3, 0)),
        (bleu4s, (0.25, 0.25, 0.25, 0.25)),
    )

    for a, p in zip(answers, predictions):
        atoks = a.split()
        ptoks = p.split()

        sub_hits.append(float(_substring_match(a, p)))
        token_f1s.append(_token_f1(atoks, ptoks))
        levs.append(Levenshtein.ratio(a, p))

        for bucket, weights in bleu_specs:
            bucket.append(
                sentence_bleu([atoks], ptoks, weights=weights, smoothing_function=smoothie)
            )

        # Argument order is (reference tokens, hypothesis tokens).
        meteors.append(single_meteor_score(atoks, ptoks))

        # Argument order is (target, prediction).
        scores = scorer.score(a, p)
        r1_f1s.append(scores['rouge1'].fmeasure)
        rl_f1s.append(scores['rougeL'].fmeasure)

    # bert_score takes candidates first, references second; only F1 is kept.
    _, _, F = bert_score(predictions, answers, lang='en', rescale_with_baseline=True)
    bert_f1 = F.mean().item()

    sub_acc = mean(sub_hits)

    return {
        'Exact Match Acc': exact_acc,
        'Substring Match Acc': sub_acc,
        'Exact F1': exact_acc,
        'Substring F1': sub_acc,
        'Token-level Macro F1': mean(token_f1s),
        'BERTScore F1': bert_f1,
        'ROUGE-1 F1': mean(r1_f1s),
        'ROUGE-L F1': mean(rl_f1s),
        'BLEU-1': mean(bleu1s),
        'BLEU-2': mean(bleu2s),
        'BLEU-3': mean(bleu3s),
        'BLEU-4': mean(bleu4s),
        'Levenshtein Similarity': mean(levs),
        'METEOR': mean(meteors)
    }

# Score the predictions file and print every metric on its own line,
# formatted to four decimal places.
df = load_data(PRED_CSV)
metrics = compute_metrics(df)
for name in metrics:
    value = metrics[name]
    print(f"{name}: {value:.4f}")