In [1]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# Later cells read their input from a path under this mount point, so this
# cell has to run first on a fresh kernel.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install -q sentence-transformers tqdm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import json, shutil
from pathlib import Path
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer, util

# Stage the answers JSON from the mounted Drive onto the local Colab disk so
# the evaluation reads a local copy instead of the Drive mount.
drive_file = Path('/content/drive/MyDrive/HotpotQA_snapshot/hotpotqa_answers_qwen.json')
assert drive_file.exists(), f"{drive_file} not found."

# Local scratch directory; created on first run, reused afterwards.
work_dir = Path('/content/llm_eval')
work_dir.mkdir(exist_ok=True, parents=True)

# Keep the original file name; copy2 also carries over file metadata.
local_file = work_dir.joinpath(drive_file.name)
shutil.copy2(src=drive_file, dst=local_file)
print('✅ File copied to', local_file)


✅ File copied to /content/llm_eval/hotpotqa_answers_qwen.json


In [8]:
# Run the encoder on the GPU when one is visible to torch, otherwise on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def evaluate_em(json_path, sim_threshold=0.9, model=None):
    """Score how many supporting sentences are matched by a record's evidence.

    Despite the ``EM_*`` key names, this is not a string exact-match metric.
    A support counts as a hit when its highest dot-product similarity against
    any of the record's evidence texts is strictly greater than
    ``sim_threshold``.

    Args:
        json_path: Path to a JSON file holding a list of records. The fields
            read from each record are ``evidence`` (list of texts),
            ``supports`` (list whose items are either a sentence or a list
            whose element at index 1 is the sentence) and ``type``.
        sim_threshold: Similarity a support must exceed to count as a hit.
        model: Optional, already-loaded encoder exposing ``encode``. When
            omitted, ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` is
            loaded on ``DEVICE``. Passing a model avoids reloading it on
            every call.

    Returns:
        dict with the hit rates ``EM_all``, ``EM_bridge`` and
        ``EM_comparison``. A rate is ``0`` when no support was counted for it.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if model is None:
        model = SentenceTransformer(
            "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
            device=DEVICE
        )

    total = correct = 0
    total_bridge = correct_bridge = 0
    total_comp = correct_comp = 0

    for rec in tqdm(data, desc="Evaluating"):
        ev_texts = rec.get('evidence', [])
        # NOTE(review): records without evidence are skipped entirely, so
        # their supports are not counted as misses. Confirm this is intended,
        # since it raises the reported rates.
        if not ev_texts:
            continue

        # A null `supports` value is treated the same as an empty list.
        sentences = [
            support[1] if isinstance(support, list) else support
            for support in (rec.get('supports') or [])
        ]
        if not sentences:
            continue

        # Encode all supports of the record in one batch instead of issuing
        # one encode call per support. No explicit device is passed, so the
        # encoder uses the device it was loaded on.
        ev_embs = model.encode(ev_texts)
        support_embs = model.encode(sentences)

        # sims[i, j] = dot product of support i with evidence j.
        # NOTE(review): the default model name suggests cosine-normalised
        # embeddings, which would make this a cosine similarity; confirm
        # before supplying a different encoder.
        sims = util.dot_score(support_embs, ev_embs)
        best_per_support = sims.max(dim=1).values
        n_supports = len(sentences)
        n_hits = int((best_per_support > sim_threshold).sum())

        total += n_supports
        correct += n_hits

        if rec.get('type') == 'bridge':
            total_bridge += n_supports
            correct_bridge += n_hits
        else:
            # Every record whose type is not 'bridge' (including a missing
            # type) is counted under the comparison bucket.
            total_comp += n_supports
            correct_comp += n_hits

    return {
        'EM_all':        correct / total if total else 0,
        'EM_bridge':     correct_bridge / total_bridge if total_bridge else 0,
        'EM_comparison': correct_comp / total_comp if total_comp else 0
    }


In [9]:
# Score the locally staged answers file and print one line per bucket,
# each rate formatted to four decimal places.
results = evaluate_em(local_file)

print("\n🔹 HotpotQA EM Scores:")
print("\n".join(
    f"{label}{results[key]:.4f}"
    for label, key in (
        ("  • All supports:           ", 'EM_all'),
        ("  • Bridge‐type supports:   ", 'EM_bridge'),
        ("  • Comparison‐type supports: ", 'EM_comparison'),
    )
))


Evaluating: 100%|██████████| 100/100 [00:03<00:00, 30.12it/s]


🔹 HotpotQA EM Scores:
  • All supports:           0.4286
  • Bridge‐type supports:   0.4234
  • Comparison‐type supports: 0.4352



