<a href="https://colab.research.google.com/github/m-zayed5722/Miscellaneous-Projects/blob/main/EvalForge_Lite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pandas numpy rapidfuzz


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m2.6/3.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import re
import math
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
from rapidfuzz import fuzz


In [3]:
@dataclass
class JudgeScore:
    correctness: float   # 0-5
    completeness: float  # 0-5
    clarity: float       # 0-5
    faithfulness: float  # 0-5 (to provided context, if any)
    notes: str           # short justification
    winner: Optional[str] = None  # "A", "B", or None

    @property
    def total(self) -> float:
        return self.correctness + self.completeness + self.clarity + self.faithfulness

RUBRIC = {
    "correctness": "Is it factually correct given the reference (if provided)?",
    "completeness": "Does it fully answer the question?",
    "clarity": "Is it well-structured and easy to understand?",
    "faithfulness": "Does it stay grounded to the provided context (if any), avoiding extra claims?"
}


In [4]:
HEDGING = {"maybe","might","could","possibly","somehow","i think","probably"}
FLUFF = {"as an ai","i cannot","i’m unable","cannot access","browse the web","i don’t have"}

def normalize(t: str) -> str:
    t = (t or "").strip()
    t = re.sub(r"\s+", " ", t)
    return t

def keywords(text: str) -> set:
    text = normalize(text).lower()
    toks = re.findall(r"[a-z0-9_]+", text)
    stop = {"the","a","an","and","or","to","of","in","on","for","with","is","are","was","were","be","as","at","by","from","that","this","it"}
    return {w for w in toks if w not in stop and len(w) >= 3}

def structure_score(ans: str) -> float:
    ans = normalize(ans)
    bullets = len(re.findall(r"(^|\n)\s*[-*]\s+", ans))
    numbered = len(re.findall(r"(^|\n)\s*\d+\.\s+", ans))
    headings = len(re.findall(r"\n\s*#+\s+", ans))
    return min(5.0, 1.0 + 0.6*bullets + 0.6*numbered + 0.4*headings)

def penalty_score(ans: str) -> float:
    a = normalize(ans).lower()
    p = 0.0
    if any(h in a for h in HEDGING): p += 0.6
    if any(f in a for f in FLUFF): p += 1.0
    if len(a) < 40: p += 1.0
    return min(3.0, p)

def overlap_similarity(a: str, b: str) -> float:
    # returns 0..1 (token similarity)
    ka, kb = keywords(a), keywords(b)
    if not ka or not kb:
        return 0.0
    inter = len(ka & kb)
    union = len(ka | kb)
    return inter / union if union else 0.0

def relevance_to_question(question: str, ans: str) -> float:
    kq = keywords(question)
    ka = keywords(ans)
    if not kq:
        return 0.5
    return min(1.0, len(kq & ka) / max(1, min(len(kq), 10)))

def offline_judge_single(question: str, answer: str, reference: Optional[str]=None, context: Optional[str]=None) -> JudgeScore:
    answer = normalize(answer)
    reference = normalize(reference) if reference else None
    context = normalize(context) if context else None

    rel = relevance_to_question(question, answer)  # 0..1
    struct = structure_score(answer)               # 1..5
    pen = penalty_score(answer)                    # 0..3

    # correctness/completeness: best-effort proxy via overlap with reference (if available)
    if reference:
        sim_ref = overlap_similarity(answer, reference)  # 0..1
        correctness = 1.5 + 3.5 * sim_ref
        completeness = 1.5 + 3.5 * sim_ref
        notes_bits = [f"ref_overlap={sim_ref:.2f}"]
    else:
        correctness = 2.0 + 3.0 * rel
        completeness = 2.0 + 3.0 * rel
        notes_bits = [f"q_relevance={rel:.2f}"]

    # clarity: structure minus penalties
    clarity = max(0.0, min(5.0, struct - 0.6*pen))

    # faithfulness: overlap with context if provided; otherwise neutral
    if context:
        sim_ctx = overlap_similarity(answer, context)
        faithfulness = 2.0 + 3.0 * sim_ctx
        notes_bits.append(f"ctx_overlap={sim_ctx:.2f}")
    else:
        faithfulness = 3.0  # neutral

    # apply penalty lightly to correctness & completeness if too short/fluffy
    correctness = max(0.0, min(5.0, correctness - 0.35*pen))
    completeness = max(0.0, min(5.0, completeness - 0.35*pen))
    faithfulness = max(0.0, min(5.0, faithfulness - 0.25*pen))

    notes_bits.append(f"penalty={pen:.2f}")
    return JudgeScore(
        correctness=float(correctness),
        completeness=float(completeness),
        clarity=float(clarity),
        faithfulness=float(faithfulness),
        notes="; ".join(notes_bits)
    )

def offline_judge_pair(question: str, ans_a: str, ans_b: str, reference: Optional[str]=None, context: Optional[str]=None) -> JudgeScore:
    sa = offline_judge_single(question, ans_a, reference, context)
    sb = offline_judge_single(question, ans_b, reference, context)

    if abs(sa.total - sb.total) < 0.35:
        winner = None
    else:
        winner = "A" if sa.total > sb.total else "B"

    return JudgeScore(
        correctness=round((sa.correctness + sb.correctness)/2, 3),   # not used for pair; kept for structure
        completeness=round((sa.completeness + sb.completeness)/2, 3),
        clarity=round((sa.clarity + sb.clarity)/2, 3),
        faithfulness=round((sa.faithfulness + sb.faithfulness)/2, 3),
        notes=f"A_total={sa.total:.2f} ({sa.notes}) | B_total={sb.total:.2f} ({sb.notes})",
        winner=winner
    ), sa, sb


In [5]:
def evaluate_question(
    question: str,
    answers: Dict[str, str],
    reference: Optional[str] = None,
    context: Optional[str] = None
) -> pd.DataFrame:
    """
    answers: dict like {"A": "...", "B": "...", "C": "..."}
    Returns a table of per-answer scores using offline judge.
    """
    rows = []
    for label, ans in answers.items():
        s = offline_judge_single(question, ans, reference=reference, context=context)
        rows.append({
            "label": label,
            "correctness": s.correctness,
            "completeness": s.completeness,
            "clarity": s.clarity,
            "faithfulness": s.faithfulness,
            "total": s.total,
            "notes": s.notes,
        })
    df = pd.DataFrame(rows).sort_values("total", ascending=False).reset_index(drop=True)
    return df

def pairwise_battles(
    question: str,
    answers: Dict[str, str],
    reference: Optional[str] = None,
    context: Optional[str] = None
) -> pd.DataFrame:
    labels = list(answers.keys())
    battle_rows = []
    for i in range(len(labels)):
        for j in range(i+1, len(labels)):
            a, b = labels[i], labels[j]
            pair_score, sa, sb = offline_judge_pair(
                question, answers[a], answers[b], reference=reference, context=context
            )
            battle_rows.append({
                "A": a,
                "B": b,
                "winner": pair_score.winner,
                "A_total": sa.total,
                "B_total": sb.total,
                "margin": abs(sa.total - sb.total),
                "details": pair_score.notes
            })
    return pd.DataFrame(battle_rows).sort_values(["margin"], ascending=False).reset_index(drop=True)

def winrate_from_battles(battles_df: pd.DataFrame, labels: List[str]) -> pd.DataFrame:
    wins = {l: 0 for l in labels}
    ties = {l: 0 for l in labels}
    games = {l: 0 for l in labels}

    for _, r in battles_df.iterrows():
        a, b, w = r["A"], r["B"], r["winner"]
        games[a] += 1; games[b] += 1
        if w == "A":
            wins[a] += 1
        elif w == "B":
            wins[b] += 1
        else:
            ties[a] += 1
            ties[b] += 1

    rows = []
    for l in labels:
        g = games[l]
        w = wins[l]
        t = ties[l]
        win_rate = (w + 0.5*t) / g if g else 0.0
        rows.append({"label": l, "wins": w, "ties": t, "games": g, "win_rate": round(win_rate, 3)})
    return pd.DataFrame(rows).sort_values("win_rate", ascending=False).reset_index(drop=True)


In [6]:
EVAL_SET = [
    {
        "id": "q1",
        "question": "Explain RAG in 3-5 sentences and why citations matter.",
        "reference": "Retrieval-Augmented Generation (RAG) combines retrieving relevant documents with generating an answer grounded in those documents. The retriever finds supporting passages and the generator uses them to produce a response. Citations matter because they show where information came from and help users verify claims. They also reduce hallucinations by encouraging grounded answers.",
        "context": None,
        "answers": {
            "A": "RAG is when a model looks up relevant documents and then writes an answer using them. It helps reduce hallucinations because the answer is grounded in retrieved text. Citations matter because they let the reader verify the source and increase trust. They also help debug retrieval issues.",
            "B": "RAG is basically a fancy AI technique that might use a database. It could help sometimes. Citations are nice for trust, I think.",
            "C": "RAG mixes retrieval with generation: first fetch top-k chunks from a knowledge base, then condition the LLM on those chunks to answer. Citations matter because they link claims to evidence and make it obvious when retrieval didn’t support an answer."
        }
    },
    {
        "id": "q2",
        "question": "Give 4 guardrails for a Text-to-SQL system used by executives.",
        "reference": "Use read-only SQL (SELECT-only) with blocked DDL/DML, enforce table/column allowlists, apply LIMIT and timeouts, and require parameterization/safe filters. Add auditing/logging, PII redaction where needed, and a fallback when confidence is low.",
        "context": None,
        "answers": {
            "A": "- Allowlist tables/columns\n- Block INSERT/UPDATE/DELETE/DROP\n- Always add LIMIT + timeout\n- Log queries + results metadata for audits",
            "B": "Just make sure the model is smart and give it the schema. Executives won’t do anything dangerous.",
            "C": "1) SELECT-only policy + forbid DDL/DML 2) allowlist tables/columns 3) enforce LIMIT + query timeout 4) audit logs + alerting for sensitive queries"
        }
    }
]


In [7]:
all_score_rows = []
all_battle_rows = []

for item in EVAL_SET:
    qid = item["id"]
    q = item["question"]
    ref = item.get("reference")
    ctx = item.get("context")
    answers = item["answers"]

    score_df = evaluate_question(q, answers, reference=ref, context=ctx)
    score_df.insert(0, "qid", qid)
    all_score_rows.append(score_df)

    battles_df = pairwise_battles(q, answers, reference=ref, context=ctx)
    battles_df.insert(0, "qid", qid)
    all_battle_rows.append(battles_df)

scores = pd.concat(all_score_rows, ignore_index=True)
battles = pd.concat(all_battle_rows, ignore_index=True)

print("Per-answer scores:")
display(scores)

print("\nPairwise battles:")
display(battles)


Per-answer scores:


Unnamed: 0,qid,label,correctness,completeness,clarity,faithfulness,total,notes
0,q1,A,2.537037,2.537037,1.0,3.0,9.074074,ref_overlap=0.30; penalty=0.00
1,q1,C,2.136364,2.136364,1.0,3.0,8.272727,ref_overlap=0.18; penalty=0.00
2,q1,B,1.504286,1.504286,0.64,2.85,6.498571,ref_overlap=0.06; penalty=0.60
3,q2,C,2.0,2.0,1.0,3.0,8.0,ref_overlap=0.14; penalty=0.00
4,q2,A,1.655556,1.655556,1.6,3.0,7.911111,ref_overlap=0.04; penalty=0.00
5,q2,B,1.5,1.5,1.0,3.0,7.0,ref_overlap=0.00; penalty=0.00



Pairwise battles:


Unnamed: 0,qid,A,B,winner,A_total,B_total,margin,details
0,q1,A,B,A,9.074074,6.498571,2.575503,A_total=9.07 (ref_overlap=0.30; penalty=0.00) ...
1,q1,B,C,B,6.498571,8.272727,1.774156,A_total=6.50 (ref_overlap=0.06; penalty=0.60) ...
2,q1,A,C,A,9.074074,8.272727,0.801347,A_total=9.07 (ref_overlap=0.30; penalty=0.00) ...
3,q2,B,C,B,7.0,8.0,1.0,A_total=7.00 (ref_overlap=0.00; penalty=0.00) ...
4,q2,A,B,A,7.911111,7.0,0.911111,A_total=7.91 (ref_overlap=0.04; penalty=0.00) ...
5,q2,A,C,,7.911111,8.0,0.088889,A_total=7.91 (ref_overlap=0.04; penalty=0.00) ...


In [8]:
def disagreement_index(battles_df: pd.DataFrame) -> float:
    # higher means more ties/close margins -> less decisive judge
    if battles_df.empty:
        return 0.0
    close = (battles_df["margin"] < 0.5).mean()
    ties = (battles_df["winner"].isna()).mean()
    return round(0.6*close + 0.4*ties, 3)

labels = sorted({l for item in EVAL_SET for l in item["answers"].keys()})

wr = winrate_from_battles(battles, labels)
print("Win-rate across the eval set:")
display(wr)

print("Judge disagreement index (0=decisive, 1=very unsure):", disagreement_index(battles))


Win-rate across the eval set:


Unnamed: 0,label,wins,ties,games,win_rate
0,A,3,1,4,0.875
1,C,2,1,4,0.625
2,B,0,0,4,0.0


Judge disagreement index (0=decisive, 1=very unsure): 0.167


In [9]:
USE_LLM_JUDGE = False

LLM_JUDGE_PROMPT = """You are a strict evaluator.
Given:
- A question
- Optional context (evidence)
- Two answers (A and B)

Score each answer from 0-5 on:
1) correctness
2) completeness
3) clarity
4) faithfulness to context (if provided)

Then pick a winner: A, B, or tie.
Return JSON only with keys:
scores: {A:{...}, B:{...}}, winner, short_reason
"""

def llm_judge_pair(question: str, ans_a: str, ans_b: str, context: Optional[str]=None) -> dict:
    raise NotImplementedError("Add your LLM call here and return the required JSON format.")
