In [4]:
import pandas as pd
import string
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Translation table that deletes every character in string.punctuation
# (ASCII punctuation); built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def normalize(text):
    """Return a lowercase, punctuation-free, edge-trimmed version of ``text``.

    Args:
        text: Value to normalize. Strings are used as-is. ``None`` and float
            NaN (e.g. what pandas yields for a missing CSV cell) are treated
            as missing and produce ``""``. Any other value is converted with
            ``str()`` first.

    Returns:
        str: ``text`` lowercased, with ASCII punctuation removed and leading
        and trailing whitespace stripped. Interior whitespace is left as-is.
    """
    # NaN is the only float that is not equal to itself. Without this check a
    # missing value would be stringified to the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCT_TABLE).strip()

def evaluate_bleu_csv(path_csv, bleu_type=1):
    """Compute the mean sentence-level BLEU score over a CSV of answers.

    Each row's ``generated_answer`` is scored against its ``answer`` (the
    single reference) after both are passed through ``normalize`` and split
    on whitespace. Scores use NLTK's ``sentence_bleu`` with smoothing
    ``method4`` and are averaged over all rows.

    Args:
        path_csv: Path (or anything ``pandas.read_csv`` accepts) to a CSV with
            ``answer`` and ``generated_answer`` columns.
        bleu_type: Maximum n-gram order, one of 1, 2, 3 or 4. The n-gram
            orders up to ``bleu_type`` are weighted uniformly, so 1 gives
            BLEU-1 and 2 gives BLEU-2.

    Returns:
        float: Mean BLEU score over the rows, or ``0.0`` when the CSV has no
        data rows.

    Raises:
        ValueError: If ``bleu_type`` is not 1, 2, 3 or 4.
        KeyError: If a required column is missing.
    """
    # Reject unsupported orders up front; previously anything other than 1
    # was silently scored as BLEU-2.
    if bleu_type not in (1, 2, 3, 4):
        raise ValueError(f"bleu_type must be 1, 2, 3 or 4, got {bleu_type!r}")

    df = pd.read_csv(path_csv)
    # Look the columns up before the empty check so a wrong schema still fails.
    answer_col = df["answer"]
    generated_col = df["generated_answer"]

    if len(df) == 0:
        # No rows to average over; avoid dividing by zero.
        return 0.0

    def _tokens(column):
        # Missing cells become an empty token list rather than the word
        # "nan", matching how evaluate_exact_match_csv treats them.
        return [normalize("" if pd.isna(cell) else cell).split() for cell in column]

    references = _tokens(answer_col)
    candidates = _tokens(generated_col)

    # Uniform weights over the first `bleu_type` n-gram orders, zero-padded
    # to four entries: (1.0, 0, 0, 0) for BLEU-1, (0.5, 0.5, 0, 0) for BLEU-2.
    weights = tuple(
        1.0 / bleu_type if order < bleu_type else 0.0 for order in range(4)
    )

    smoothie = SmoothingFunction().method4
    bleu_scores = [
        sentence_bleu([ref], cand, weights=weights, smoothing_function=smoothie)
        for ref, cand in zip(references, candidates)
    ]

    return sum(bleu_scores) / len(bleu_scores)

def evaluate_exact_match_csv(path_csv):
    """Compute the exact-match rate between reference and generated answers.

    Both columns have missing values replaced by ``""``, are cast to ``str``
    and passed through ``normalize`` (the same normalization used for BLEU).
    A row counts as a match when the two normalized strings are equal.

    Args:
        path_csv: Path (or anything ``pandas.read_csv`` accepts) to a CSV with
            ``answer`` and ``generated_answer`` columns.

    Returns:
        float: Fraction of rows whose normalized answers are identical, or
        ``0.0`` when the CSV has no data rows.

    Raises:
        KeyError: If a required column is missing.
    """
    df = pd.read_csv(path_csv)
    # Look the columns up before the empty check so a wrong schema still fails.
    answer_col = df["answer"]
    generated_col = df["generated_answer"]

    if len(df) == 0:
        # No rows to compare; avoid the 0/0 division (NaN) further down.
        return 0.0

    # Reuse the shared helper instead of a private copy so both metrics are
    # guaranteed to normalize text the same way.
    refs = answer_col.fillna("").astype(str).apply(normalize)
    preds = generated_col.fillna("").astype(str).apply(normalize)

    return float((refs == preds).sum() / len(refs))

In [None]:
# Result CSVs for the three runs being compared (simple, HyDE, Fusion).
csv_simple, csv_hyde, csv_fusion = (
    "generated_responses_simple.csv",
    "generated_responses_simple_HyDE.csv",
    "generated_responses_simple_Fusion.csv",
)

# For each run, evaluate BLEU-1 first and BLEU-2 second.
bleu_simple_1, bleu_simple_2 = [
    evaluate_bleu_csv(csv_simple, bleu_type=order) for order in (1, 2)
]
bleu_hyde_1, bleu_hyde_2 = [
    evaluate_bleu_csv(csv_hyde, bleu_type=order) for order in (1, 2)
]
bleu_fusion_1, bleu_fusion_2 = [
    evaluate_bleu_csv(csv_fusion, bleu_type=order) for order in (1, 2)
]

# One summary line per run, scores rounded to three decimals.
print(f"[SIMPLE] BLEU-1: {bleu_simple_1:.3f} | BLEU-2: {bleu_simple_2:.3f}")
print(f"[HyDE  ] BLEU-1: {bleu_hyde_1:.3f} | BLEU-2: {bleu_hyde_2:.3f}")
print(f"[Fusion] BLEU-1: {bleu_fusion_1:.3f} | BLEU-2: {bleu_fusion_2:.3f}")

[SIMPLE] BLEU-1: 0.580 | BLEU-2: 0.571
[HyDE  ] BLEU-1: 0.564 | BLEU-2: 0.554
[Fusion] BLEU-1: 0.550 | BLEU-2: 0.538


In [6]:
# Result CSVs for the three runs being compared (simple, HyDE, Fusion).
csv_simple, csv_hyde, csv_fusion = (
    "generated_responses_simple.csv",
    "generated_responses_simple_HyDE.csv",
    "generated_responses_simple_Fusion.csv",
)

# Exact-match rate per run, evaluated in the order simple, HyDE, Fusion.
em_simple, em_hyde, em_fusion = [
    evaluate_exact_match_csv(csv_path)
    for csv_path in (csv_simple, csv_hyde, csv_fusion)
]

# One summary line per run, rates rounded to three decimals.
print(f"[SIMPLE] Exact Match : {em_simple:.3f}")
print(f"[HyDE  ] Exact Match : {em_hyde:.3f}")
print(f"[Fusion] Exact Match : {em_fusion:.3f}")

[SIMPLE] Exact Match : 0.527
[HyDE  ] Exact Match : 0.510
[Fusion] Exact Match : 0.490
