In [1]:
#1
# Standard imports
import json, re, os
from collections import Counter
from typing import List, Dict, Any, Tuple, Union
import pandas as pd
import numpy as np

# Let text columns show up to 200 characters per cell when frames are displayed.
pd.set_option("display.max_colwidth", 200)

# Baseline answer files the notebook looks for in the working directory,
# grouped by model (order matters only for display).
BASELINE_FILES = [
    # Qwen baselines
    "hotpotqa_answers_qwen.json", "2wiki_answers_qwen.json",
    # Gemini baselines
    "hotpotqa_answers_gemini.json", "2wiki_answers_gemini.json",
]

# Answer files for the proposed runs (HotpotQA only).
PROPOSED_FILES = [
    "hotpotqa_answers_qwen_proposed.json", "hotpotqa_answers_gemini_proposed.json",
]

# --------- Loaders ---------
def load_json_records(path: str) -> List[Dict[str, Any]]:
    """Load dict records from a JSON array, a single JSON object, or NDJSON.

    Args:
        path: Path of the file to read.

    Returns:
        A list of dict records:
        * JSON array  -> its dict elements, in order (non-dict elements are dropped);
        * JSON object -> a one-element list wrapping that object;
        * otherwise   -> the file is read line by line as NDJSON, keeping every
          line that parses to a dict and skipping blank/unparseable lines.
        An empty (or whitespace-only) file yields [].

    Raises:
        OSError: if the file cannot be opened.
    """
    # "utf-8-sig" strips a leading BOM if present and otherwise behaves like
    # "utf-8"; without it json.loads rejects BOM-prefixed text and the whole
    # file would silently load as [].
    with open(path, "r", encoding="utf-8-sig") as f:
        text = f.read().strip()
    if not text:
        return []

    # First attempt: the whole file is a single JSON document.
    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        obj = None
    else:
        if isinstance(obj, list):
            # Keep only dict records so callers can rely on rec.get(...),
            # matching what the NDJSON branch below already guarantees.
            return [rec for rec in obj if isinstance(rec, dict)]
        if isinstance(obj, dict):
            return [obj]

    # Fallback: NDJSON (also reached when the document is a bare scalar).
    recs = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort loader: a malformed line is skipped, not fatal.
            continue
        if isinstance(rec, dict):
            recs.append(rec)
    return recs

# --------- Normalization & F1 (SQuAD-like) ---------
_ARTICLES = {"a", "an", "the"}
_PUNCT = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""

def normalize_text(s: Union[str, None]) -> str:
    """Canonicalize an answer string for token comparison.

    The text is lowercased, every character listed in ``_PUNCT`` is turned
    into a space, the words in ``_ARTICLES`` are dropped, and the remaining
    words are joined with single spaces. ``None`` normalizes to ``""``.
    """
    if s is None:
        return ""
    # Map each punctuation character to a blank so that it acts as a word
    # separator (e.g. "a-b" splits into two words) instead of vanishing.
    blank_out = str.maketrans(_PUNCT, " " * len(_PUNCT))
    words = s.lower().translate(blank_out).split()
    return " ".join(word for word in words if word not in _ARTICLES)

def tokenize(s: str) -> List[str]:
    """Return the whitespace-separated words of the normalized form of ``s``."""
    normalized = normalize_text(s)
    return normalized.split()

def f1_token_level(pred: str, truth: str) -> float:
    """Bag-of-words F1 between a prediction and one reference answer.

    Both strings are tokenized with ``tokenize``; the overlap is the multiset
    intersection of the two token lists. Two empty token lists score 1.0,
    exactly one empty list scores 0.0.
    """
    pred_tokens = tokenize(pred)
    truth_tokens = tokenize(truth)

    if not pred_tokens or not truth_tokens:
        # Agreement on "nothing" is a perfect match; otherwise it is a miss.
        return 1.0 if not pred_tokens and not truth_tokens else 0.0

    # Counter & Counter keeps, per token, the smaller of the two counts.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    n_shared = sum(shared.values())
    if n_shared == 0:
        return 0.0

    precision = n_shared / len(pred_tokens)
    recall = n_shared / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

def pick_prediction(rec: Dict[str, Any]) -> str:
    """Return the record's prediction text, stripped of surrounding whitespace.

    The 'response' field wins over 'answer'; a field is used only when it
    holds a string that is non-empty after stripping. If neither qualifies,
    the result is the empty string.
    """
    candidates = (rec.get("response"), rec.get("answer"))
    for value in candidates:
        if not isinstance(value, str):
            continue
        stripped = value.strip()
        if stripped:
            return stripped
    return ""

def extract_gts(rec: Dict[str, Any]) -> List[str]:
    """Return the record's ground-truth candidates as a non-empty list of strings.

    A missing/None 'gt' or an empty list yields [''] so callers always have
    at least one candidate. A list is converted element-wise (non-strings via
    str()); any other value becomes a one-element list.
    """
    gt = rec.get("gt")
    if gt is None:
        return [""]
    if isinstance(gt, list):
        candidates = [item if isinstance(item, str) else str(item) for item in gt]
        return candidates if candidates else [""]
    # Scalar ground truth: keep strings as they are, stringify everything else.
    return [gt if isinstance(gt, str) else str(gt)]

def evaluate_records(records: List[Dict[str, Any]], file_tag: str) -> Tuple[pd.DataFrame, float]:
    """Score every record and return (per-record frame, macro-F1).

    Each record's F1 is the best score of its prediction against any of its
    ground-truth candidates. The frame has one row per record with columns
    file, idx, question, gt, prediction, f1; the macro-F1 is the mean of the
    f1 column, or NaN when there are no records.
    """
    scored = []
    for position, record in enumerate(records):
        prediction = pick_prediction(record)
        candidates = extract_gts(record)
        best_f1 = max(
            (f1_token_level(prediction, candidate) for candidate in candidates),
            default=0.0,
        )

        # Show a lone ground truth as a plain string, several as a list.
        if len(candidates) > 1:
            gt_cell = candidates
        elif candidates:
            gt_cell = candidates[0]
        else:
            gt_cell = ""

        scored.append({
            "file": file_tag,
            "idx": position,
            "question": record.get("question", ""),
            "gt": gt_cell,
            "prediction": prediction,
            "f1": best_f1,
        })

    df = pd.DataFrame(scored)
    if len(df) == 0:
        # An empty frame has no 'f1' column to average.
        return df, float("nan")
    return df, float(df["f1"].mean())

def build_gt_bank(baseline_files: List[str]) -> Dict[str, Union[str, List[str]]]:
    """Create a {question -> gt} mapping from baseline files that contain GTs.

    Args:
        baseline_files: Paths of the baseline answer files. Paths that do not
            exist are skipped.

    Returns:
        A dict keyed by the stripped question text. A list-valued GT is stored
        as-is; any other GT is stored as ``str(gt)``. Records without a
        question, or whose GT is missing or empty (None, "" or []), are
        ignored. When a question occurs more than once, the last usable GT
        wins.
    """
    bank = {}
    for fp in baseline_files:
        if not os.path.exists(fp):
            continue
        for rec in load_json_records(fp):
            if not isinstance(rec, dict):
                # Tolerate stray non-object entries instead of failing on .get().
                continue
            q = (rec.get("question") or "").strip()
            if not q:
                continue
            gt = rec.get("gt", None)
            # Empty GTs carry no information: storing them could overwrite a
            # usable GT for the same question and would hand consumers an
            # empty reference answer.
            if gt in (None, "", []):
                continue
            bank[q] = gt if isinstance(gt, list) else str(gt)
    return bank


In [2]:
#2
# Upload all six files at once (Colab only).
# Outside Colab the upload helper is unavailable, so the upload step is skipped
# and the files are expected to already sit in the working directory, which is
# where the later cells look for them (via os.path.exists).
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print("Please select ALL 6 JSON files (multi-select).")
    uploaded = files.upload()
else:
    uploaded = {}
print("Uploaded:", list(uploaded.keys()))

# Later cells treat a missing file as an empty record list, so point out any
# gap right away instead of letting it surface as a NaN score.
missing_files = [fp for fp in BASELINE_FILES + PROPOSED_FILES if not os.path.exists(fp)]
if missing_files:
    print("WARNING: expected files not found in the working directory:", missing_files)


Please select ALL 6 JSON files (multi-select).


Uploaded: []


In [3]:
#3
# 1) Build GT bank from baseline files
gt_bank = build_gt_bank(BASELINE_FILES)

# 2) Load proposed files and inject GTs by exact question match (if GT missing)
proposed_loaded = {}
# patched:   records that received a GT from the bank
# total:     all proposed records seen
# unmatched: records lacking a GT whose question is not in the bank
patched, total, unmatched = 0, 0, 0

for fp in PROPOSED_FILES:
    if not os.path.exists(fp):
        # The file still gets a summary row (0 records, NaN macro-F1); say why
        # so the NaN is not mistaken for a scoring problem.
        print(f"WARNING: proposed file not found, treating as empty: {fp}")
        proposed_loaded[fp] = []
        continue
    recs = load_json_records(fp)
    for rec in recs:
        total += 1
        # None, "" and [] all count as "no ground truth present".
        has_gt = rec.get("gt", None) not in (None, "", [])
        if not has_gt:
            q = (rec.get("question") or "").strip()
            if q in gt_bank:
                # The record is patched in place; it is re-read from disk on
                # every run of this cell, so re-running stays idempotent.
                rec["gt"] = gt_bank[q]
                patched += 1
            else:
                unmatched += 1
    proposed_loaded[fp] = recs

# 3) Evaluate each proposed file (one summary row per file)
rows = []
dfs = []  # per-record frames, one per file
for fp, recs in proposed_loaded.items():
    df, macro = evaluate_records(recs, os.path.basename(fp))
    dfs.append(df)
    rows.append({"file": os.path.basename(fp), "num_records": len(df), "macro_f1": macro})

proposed_summary = pd.DataFrame(rows).sort_values("file").reset_index(drop=True)

In [8]:
#4
# One-line account of the GT injection, followed by the per-file scores of the proposed runs.
injection_report = f"Injected GTs for proposed: patched={patched} / total={total}, unmatched_questions={unmatched}"
print(injection_report)
display(proposed_summary)

Injected GTs for proposed: patched=200 / total=200, unmatched_questions=0


Unnamed: 0,file,num_records,macro_f1
0,hotpotqa_answers_gemini_proposed.json,100,0.326761
1,hotpotqa_answers_qwen_proposed.json,100,0.386758


In [5]:
#5
# Load every baseline file; a missing file is kept as an empty record list so
# it still appears in the summary (0 records, NaN macro-F1).
baseline_loaded = {}
for fp in BASELINE_FILES:
    if os.path.exists(fp):
        baseline_loaded[fp] = load_json_records(fp)
    else:
        print(f"WARNING: baseline file not found, treating as empty: {fp}")
        baseline_loaded[fp] = []

# Evaluate each baseline file on its own: one summary row per file.
# (No per-model aggregate is computed here.)
# NOTE: `rows` and `dfs` are rebound here, replacing the lists built for the
# proposed files in cell #3.
rows = []
dfs = []  # per-record frames, one per file
for fp, recs in baseline_loaded.items():
    df, macro = evaluate_records(recs, os.path.basename(fp))
    dfs.append(df)
    rows.append({"file": os.path.basename(fp), "num_records": len(df), "macro_f1": macro})

baseline_summary = pd.DataFrame(rows).sort_values("file").reset_index(drop=True)



In [6]:
#6
# Show the per-file baseline summary (file, num_records, macro_f1) built in cell #5.
display(baseline_summary)

Unnamed: 0,file,num_records,macro_f1
0,2wiki_answers_gemini.json,100,0.359908
1,2wiki_answers_qwen.json,100,0.211611
2,hotpotqa_answers_gemini.json,100,0.530597
3,hotpotqa_answers_qwen.json,100,0.402237
