In [1]:
import re
import statistics
import random
from typing import Dict, Any, List, Tuple

In [None]:
#https://www.confident-ai.com/blog/llm-evaluation-metrics-everything-you-need-for-llm-evaluation#task-completion
#https://github.com/Arize-ai/phoenix/blob/main/tutorials/evals/optimizing_llm_as_a_judge_prompts.ipynb
#https://www.confident-ai.com/blog/why-llm-as-a-judge-is-the-best-llm-evaluation-method#chain-of-thought-prompting
#https://huggingface.co/prometheus-eval/prometheus-7b-v2.0

In [24]:
# --- Fill this with your LLM client call. Keep temperature low for judging.
def call_llm_system(prompt: str, temperature: float = 0.0) -> str:
    """Placeholder judge backend; swap the body for a real LLM SDK call.

    A real implementation would send ``prompt`` to a client (OpenAI,
    Anthropic, a local wrapper, ...) using ``temperature`` and hand back the
    raw text of the reply. This mock ignores both arguments and always
    returns the same fixed reply so the rest of the notebook can be run
    without a model.

    Args:
        prompt: Full judge prompt (unused by the mock).
        temperature: Sampling temperature (unused by the mock).

    Returns:
        The raw reply text.
    """
    # e.g. client.chat(messages=[...], temperature=temperature) in real code.
    mock_reply = "Score: 4.0\nJustification: Answer is mostly correct but misses one small point."
    return mock_reply

# --- Prompt builder
def build_rubric_prompt(task: str, question: str, candidate: str, rubric: Dict[str, Dict]) -> str:
    """Assemble the judge prompt for one candidate answer.

    Args:
        task: Instruction the candidate was asked to follow.
        question: Question the candidate answered.
        candidate: The answer to be scored.
        rubric: Maps each dimension name to a dict holding a ``"description"``
            entry; every dimension is listed as one ``- name: description (0-5)``
            bullet, in the mapping's iteration order.

    Returns:
        The prompt text, asking for a JSON object of 0-5 scores.

    Raises:
        KeyError: If a rubric entry has no ``"description"`` key.
    """
    rubric_block = "\n".join(
        f"- {dimension}: {spec['description']} (0-5)"
        for dimension, spec in rubric.items()
    )
    prompt_lines = [
        "You are an expert evaluator. Evaluate the following candidate answer for the task below using the rubric.",
        f"Task: {task}",
        f"Question: {question}",
        "",
        "Candidate answer:",
        f"{candidate}",
        "",
        "Rubric:",
        rubric_block,
        "",
        "Return a JSON object with numeric scores for each rubric dimension (0-5) and an overall numeric score 0-5.",
        "Output only valid JSON, example:",
        # The example keys are fixed text; they are not derived from `rubric`.
        '{"correctness": 4.0, "completeness": 3.0, "clarity": 4.5, "overall": 4.0}',
    ]
    return "\n".join(prompt_lines)

# --- Parser: try to extract JSON-like dict from LLM response (robust)
def parse_scores_from_text(text: str) -> Dict[str, float]:
    """Extract numeric scores from a raw judge reply.

    Two strategies are tried in order:

    1. The widest ``{...}`` span in ``text`` is decoded as JSON. Every entry
       whose value converts to ``float`` is kept; entries that do not (for
       example a ``"justification"`` string) are skipped individually, so one
       textual field no longer discards the numeric scores next to it.
    2. If step 1 produced no score at all (no JSON, invalid JSON, or a JSON
       object without numeric values), the first case-insensitive
       ``score: <number>`` pattern is returned under the key ``"overall"``.

    Args:
        text: Raw reply text from the judge model.

    Returns:
        A non-empty mapping of score name to float.

    Raises:
        ValueError: If neither strategy finds a numeric score. The message
            includes the first 200 characters of ``text``.
    """
    import json  # local import: the notebook's import cell does not bring in json

    scores: Dict[str, float] = {}

    json_match = re.search(r"\{.*\}", text, flags=re.S)
    if json_match:
        try:
            parsed = json.loads(json_match.group(0))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            for key, value in parsed.items():
                try:
                    scores[key] = float(value)
                except (TypeError, ValueError, OverflowError):
                    # Not a number (text, list, null, ...): skip this entry only.
                    continue
    if scores:
        return scores

    # Fallback: extract a "Score: X" pattern.
    score_match = re.search(r"score[:\s]+([0-9]+(?:\.[0-9]+)?)", text, flags=re.I)
    if score_match:
        return {"overall": float(score_match.group(1))}

    raise ValueError("Could not parse numeric scores from LLM response: " + text[:200])

# --- Single-judge invocation with retries
def judge_single(candidate: str, question: str, task: str, rubric: Dict[str, Dict], retries: int = 2) -> Dict[str, float]:
    """Score one candidate with a single judge, re-asking when the reply is unparseable.

    The rubric prompt is built once and sent to ``call_llm_system`` at
    temperature 0.0. ``retries`` is the total number of calls allowed; the
    first reply that ``parse_scores_from_text`` accepts is returned.

    Args:
        candidate: Answer to be scored.
        question: Question the candidate answered.
        task: Instruction the candidate was asked to follow.
        rubric: Rubric mapping passed through to ``build_rubric_prompt``.
        retries: Maximum number of judge calls.

    Returns:
        The parsed scores, or ``{"overall": nan}`` when no call produced a
        parseable reply (which includes ``retries <= 0``).
    """
    prompt = build_rubric_prompt(task, question, candidate, rubric)
    for _attempt in range(retries):
        reply = call_llm_system(prompt, temperature=0.0)
        try:
            parsed = parse_scores_from_text(reply)
        except Exception:
            # Unparseable reply: ask again (in prod, log the raw text here).
            continue
        return parsed
    # Every attempt failed: signal "no score" with a NaN sentinel.
    return {"overall": float("nan")}

# --- Aggregate multiple judges (ensembling for stability)
def ensemble_judge(candidate: str, question: str, task: str, rubric: Dict[str, Dict], n: int = 3) -> Dict[str, Any]:
    """Run ``judge_single`` ``n`` times and summarise the scores per key.

    Args:
        candidate: Answer to be scored.
        question: Question the candidate answered.
        task: Instruction the candidate was asked to follow.
        rubric: Rubric mapping passed through to ``judge_single``.
        n: Number of independent judge runs.

    Returns:
        A dict with two entries:
        ``"individual"`` is the list of per-run score dicts;
        ``"agg"`` holds, for every key seen in any run, ``<key>_mean``,
        ``<key>_median`` and ``<key>_n`` (the count of non-NaN values used).
        Mean and median are NaN when a key has no usable value.
    """
    runs = [judge_single(candidate, question, task, rubric) for _ in range(n)]

    # Union of every score name reported by any run.
    score_names = set().union(*map(set, runs))

    summary = {}
    for name in score_names:
        usable = []
        for run in runs:
            if name not in run:
                continue
            value = run[name]
            if not (value == value):  # NaN never equals itself, so this drops NaN sentinels
                continue
            usable.append(value)

        if usable:
            summary[name + "_mean"] = statistics.mean(usable)
            summary[name + "_median"] = statistics.median(usable)
        else:
            summary[name + "_mean"] = float("nan")
            summary[name + "_median"] = float("nan")
        summary[name + "_n"] = len(usable)

    return {"individual": runs, "agg": summary}

# --- Bootstrap CI for the overall score
def bootstrap_ci(values: List[float], n_bootstrap: int = 1000, alpha: float = 0.05) -> Tuple[float, float]:
    """Percentile-bootstrap confidence interval for the mean of ``values``.

    ``values`` is resampled with replacement ``n_bootstrap`` times (each
    resample has ``len(values)`` items, drawn with the module-level ``random``
    generator). The interval bounds are the ``alpha/2`` and ``1 - alpha/2``
    quantiles of the resample means, using linear interpolation between
    neighbouring order statistics.

    Indexing into ``statistics.quantiles(..., n=100)`` with truncated integer
    positions was dropped: it rounded 2.5% down to the 2nd percentile and,
    for ``alpha < 0.02``, produced index ``-1`` so the lower bound became the
    99th percentile.

    Args:
        values: Observed scores. Seed ``random`` beforehand for reproducible
            intervals.
        n_bootstrap: Number of bootstrap resamples; must be at least 1.
        alpha: Total tail probability, strictly between 0 and 1
            (0.05 gives a 95% interval).

    Returns:
        ``(lower, upper)``; ``(nan, nan)`` when ``values`` is empty.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1.
        statistics.StatisticsError: If ``n_bootstrap`` is smaller than 1.
    """
    if not values:
        return (float("nan"), float("nan"))
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")
    if n_bootstrap < 1:
        # StatisticsError subclasses ValueError and is what the quantile step raised before.
        raise statistics.StatisticsError("n_bootstrap must be at least 1")

    sample_size = len(values)
    boot_means = sorted(
        statistics.mean([random.choice(values) for _ in range(sample_size)])
        for _ in range(n_bootstrap)
    )
    last_index = len(boot_means) - 1

    def percentile(q: float) -> float:
        """Linearly interpolated q-quantile (0 <= q <= 1) of the sorted means."""
        position = q * last_index
        below = int(position)
        above = min(below + 1, last_index)
        weight = position - below
        return boot_means[below] + weight * (boot_means[above] - boot_means[below])

    return percentile(alpha / 2), percentile(1 - alpha / 2)

In [25]:
# Example usage (mock)
if __name__ == "__main__":
    # Demo: score one sample answer with three mock judge runs and print the summary.
    rubric = {
        dimension: {"description": description}
        for dimension, description in [
            ("correctness", "Factual correctness and accuracy"),
            ("completeness", "Covers requested points"),
            ("clarity", "Easy to understand"),
            ("overall", "Overall quality"),
        ]
    }
    task, question, candidate = (
        "Answer the question concisely and correctly.",
        "What is the capital of France?",
        "The capital of France is Paris.",
    )

    result = ensemble_judge(candidate, question, task, rubric, n=3)
    print(result)

{'individual': [{'overall': 4.0}, {'overall': 4.0}, {'overall': 4.0}], 'agg': {'overall_mean': 4.0, 'overall_median': 4.0, 'overall_n': 3}}


In [6]:
# Sample inputs shared by the exploratory cells below.
task, question, candidate = (
    "Answer the question concisely and correctly.",
    "What is the capital of France?",
    "The capital of France is Paris.",
)

In [7]:
# --- Fill this with your LLM client call. Keep temperature low for judging.
def call_llm_system(prompt: str, temperature: float = 0.0) -> str:
    """Placeholder judge backend; swap the body for a real LLM SDK call.

    A real implementation would send ``prompt`` to a client (OpenAI,
    Anthropic, a local wrapper, ...) using ``temperature`` and hand back the
    raw text of the reply. This mock ignores both arguments and always
    returns the same fixed reply so the rest of the notebook can be run
    without a model.

    Args:
        prompt: Full judge prompt (unused by the mock).
        temperature: Sampling temperature (unused by the mock).

    Returns:
        The raw reply text.
    """
    # e.g. client.chat(messages=[...], temperature=temperature) in real code.
    mock_reply = "Score: 4.0\nJustification: Answer is mostly correct but misses one small point."
    return mock_reply

# --- Prompt builder
def build_rubric_prompt(task: str, question: str, candidate: str, rubric: Dict[str, Dict]) -> str:
    """Assemble the judge prompt for one candidate answer.

    Args:
        task: Instruction the candidate was asked to follow.
        question: Question the candidate answered.
        candidate: The answer to be scored.
        rubric: Maps each dimension name to a dict holding a ``"description"``
            entry; every dimension is listed as one ``- name: description (0-5)``
            bullet, in the mapping's iteration order.

    Returns:
        The prompt text, asking for a JSON object of 0-5 scores.

    Raises:
        KeyError: If a rubric entry has no ``"description"`` key.
    """
    rubric_block = "\n".join(
        f"- {dimension}: {spec['description']} (0-5)"
        for dimension, spec in rubric.items()
    )
    prompt_lines = [
        "You are an expert evaluator. Evaluate the following candidate answer for the task below using the rubric.",
        f"Task: {task}",
        f"Question: {question}",
        "",
        "Candidate answer:",
        f"{candidate}",
        "",
        "Rubric:",
        rubric_block,
        "",
        "Return a JSON object with numeric scores for each rubric dimension (0-5) and an overall numeric score 0-5.",
        "Output only valid JSON, example:",
        # The example keys are fixed text; they are not derived from `rubric`.
        '{"correctness": 4.0, "completeness": 3.0, "clarity": 4.5, "overall": 4.0}',
    ]
    return "\n".join(prompt_lines)

In [8]:
# Show the rendered judge prompt for the sample inputs.
# NOTE: `rubric` is only assigned in the example-usage cell, so that cell must run first.
print(build_rubric_prompt(task, question, candidate, rubric))

You are an expert evaluator. Evaluate the following candidate answer for the task below using the rubric.
Task: Answer the question concisely and correctly.
Question: What is the capital of France?

Candidate answer:
The capital of France is Paris.

Rubric:
- correctness: Factual correctness and accuracy (0-5)
- completeness: Covers requested points (0-5)
- clarity: Easy to understand (0-5)
- overall: Overall quality (0-5)

Return a JSON object with numeric scores for each rubric dimension (0-5) and an overall numeric score 0-5.
Output only valid JSON, example:
{"correctness": 4.0, "completeness": 3.0, "clarity": 4.5, "overall": 4.0}


In [13]:
def judge_single(candidate: str, question: str, task: str, rubric: Dict[str, Dict], retries: int = 2) -> Dict[str, float]:
    """Debug variant of the single-judge call: prints each raw reply and its parsed scores.

    The rubric prompt is built once and sent to ``call_llm_system`` at
    temperature 0.0. ``retries`` is the total number of calls allowed; the
    first reply that ``parse_scores_from_text`` accepts is returned.

    Args:
        candidate: Answer to be scored.
        question: Question the candidate answered.
        task: Instruction the candidate was asked to follow.
        rubric: Rubric mapping passed through to ``build_rubric_prompt``.
        retries: Maximum number of judge calls.

    Returns:
        The parsed scores, or ``{"overall": nan}`` when no call produced a
        parseable reply (which includes ``retries <= 0``).
    """
    prompt = build_rubric_prompt(task, question, candidate, rubric)
    for _attempt in range(retries):
        reply = call_llm_system(prompt, temperature=0.0)
        print("Raw: ", reply)
        try:
            parsed = parse_scores_from_text(reply)
            print("Parsed Scores: ", parsed)
        except Exception:
            # Unparseable reply: ask again (in prod, log the raw text here).
            continue
        return parsed
    # Every attempt failed: signal "no score" with a NaN sentinel.
    return {"overall": float("nan")}


In [18]:
# Call the judge backend with a throwaway prompt and display the raw reply text.
raw = call_llm_system("aaaaaaa", temperature=0.0)
raw

'Score: 4.0\nJustification: Answer is mostly correct but misses one small point.'

In [21]:
# Check the "Score: X" fallback pattern (the same regex parse_scores_from_text uses) against the raw reply.
re.search(r"score[:\s]+([0-9]+(?:\.[0-9]+)?)", raw, flags=re.I)

<re.Match object; span=(0, 10), match='Score: 4.0'>

In [22]:
def parse_scores_from_text(text: str) -> Dict[str, float]:
    """Extract numeric scores from a raw judge reply.

    Two strategies are tried in order:

    1. The widest ``{...}`` span in ``text`` is decoded as JSON. Every entry
       whose value converts to ``float`` is kept; entries that do not (for
       example a ``"justification"`` string) are skipped individually, so one
       textual field no longer discards the numeric scores next to it.
    2. If step 1 produced no score at all (no JSON, invalid JSON, or a JSON
       object without numeric values), the first case-insensitive
       ``score: <number>`` pattern is returned under the key ``"overall"``.

    Args:
        text: Raw reply text from the judge model.

    Returns:
        A non-empty mapping of score name to float.

    Raises:
        ValueError: If neither strategy finds a numeric score. The message
            includes the first 200 characters of ``text``.
    """
    import json  # local import: the notebook's import cell does not bring in json

    scores: Dict[str, float] = {}

    json_match = re.search(r"\{.*\}", text, flags=re.S)
    if json_match:
        try:
            parsed = json.loads(json_match.group(0))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            for key, value in parsed.items():
                try:
                    scores[key] = float(value)
                except (TypeError, ValueError, OverflowError):
                    # Not a number (text, list, null, ...): skip this entry only.
                    continue
    if scores:
        return scores

    # Fallback: extract a "Score: X" pattern.
    score_match = re.search(r"score[:\s]+([0-9]+(?:\.[0-9]+)?)", text, flags=re.I)
    if score_match:
        return {"overall": float(score_match.group(1))}

    raise ValueError("Could not parse numeric scores from LLM response: " + text[:200])

In [23]:
# Parse the raw reply captured earlier and display the resulting score dict.
parse_scores_from_text(raw)

{'overall': 4.0}