# Metrics:
 - Embedding similarity
 - Rouge-L
 - LLM score

In [68]:
from utils.api_clients import get_embedding
import numpy as np
from rouge_score import rouge_scorer
import re

def embedding_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with ``get_embedding`` and the two vectors are
    compared with the cosine formula ``dot(a, b) / (|a| * |b|)``.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The cosine similarity as a Python ``float``. If either embedding has
        zero norm the similarity is undefined; ``0.0`` is returned in that
        case instead of NaN so that downstream averages are not poisoned.
    """
    # NOTE(review): assumes get_embedding returns a 1-D numeric sequence — confirm.
    vec1 = np.array(get_embedding(text1))
    vec2 = np.array(get_embedding(text2))

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid a 0/0 division, which would yield NaN and a RuntimeWarning.
        return 0.0

    return float(np.dot(vec1, vec2) / norm_product)

def rouge_l_score(text1, text2):
    """Return the ROUGE-L F-measure computed for a pair of texts.

    A ``RougeScorer`` restricted to the ``rougeL`` metric (with stemming
    enabled) is built on every call; ``text1`` and ``text2`` are handed to
    its ``score`` method in that order.

    Returns:
        The ``fmeasure`` field of the ROUGE-L result, as a Python ``float``.
    """
    metric = 'rougeL'
    rouge = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    return float(rouge.score(text1, text2)[metric].fmeasure)

# Matches "Final score: <number>" case-insensitively. The number must be a
# well-formed decimal (e.g. "4", "4.5", ".5") so that sentence punctuation
# following the score ("4.5.") is not captured and fed to float().
_FINAL_SCORE_RE = re.compile(
    r"Final\s*score:\s*([0-9]+(?:\.[0-9]+)?|\.[0-9]+)",
    re.IGNORECASE,
)


def _parse_final_score(resp):
    """Extract the "Final score" from a model response, normalized by 5.

    Returns ``None`` when ``resp`` is not a string or contains no score.
    """
    if not isinstance(resp, str):
        return None

    found = _FINAL_SCORE_RE.search(resp)
    if found is None:
        return None

    # Normalize to 0-1 scale (presumes the prompt asks for a 0-5 score — confirm
    # against prompts/evaluation.yaml).
    return float(found.group(1)) / 5.0


def llm_score(rubric1, rubric2, rubric_level, question):
    """Ask an LLM to rate two rubric descriptions and return the parsed score.

    The ``rubric_evaluation`` template from ``prompts/evaluation.yaml`` is run
    through ``run_prompt`` with the four arguments as template variables, and
    the text following "Final score:" in the response is parsed.

    Args:
        rubric1: First rubric text, exposed to the template as ``rubric1``.
        rubric2: Second rubric text, exposed to the template as ``rubric2``.
        rubric_level: Name of the rubric level being compared.
        question: The question the rubrics belong to.

    Returns:
        The parsed score divided by 5 as a ``float``, or ``None`` when no
        "Final score: <number>" can be found in the response.
    """
    from utils.prompt_processing import run_prompt

    prompt_vars = {
        "rubric1": rubric1,
        "rubric2": rubric2,
        "rubric_level": rubric_level,
        "question": question
    }

    resp = run_prompt(
        template_path="prompts/evaluation.yaml",
        template_key="rubric_evaluation",
        template_vars=prompt_vars,
        model="openai/gpt-5-mini",
        base_temperature=0.0,
    )

    return _parse_final_score(resp)

def compute_score(gen_rubric, ref_rubric, question=""):
    """
    Score a generated rubric against a reference rubric, level by level.

    Every level of ``gen_rubric`` that has a non-empty counterpart in
    ``ref_rubric`` is scored with embedding similarity, ROUGE-L and the LLM
    judge; levels without reference text are skipped.

    Returns:
        dict mapping each scored level to its three metrics, plus an
        "average" entry holding the mean of each metric over the scored
        levels (0.0 when a metric has no values; LLM scores that came back
        as None are left out of the mean).
    """
    results = {}
    emb_values, rouge_values, llm_values = [], [], []

    for level, gen_text in gen_rubric.items():
        ref_text = ref_rubric.get(level, "")
        if ref_text:
            level_metrics = {
                "embedding_similarity": embedding_similarity(gen_text, ref_text),
                "rougeL": rouge_l_score(gen_text, ref_text),
                # The reference text is passed first to the LLM judge.
                "llm_score": llm_score(ref_text, gen_text, level, question),
            }
            results[level] = level_metrics

            emb_values.append(level_metrics["embedding_similarity"])
            rouge_values.append(level_metrics["rougeL"])
            if level_metrics["llm_score"] is not None:
                llm_values.append(level_metrics["llm_score"])

    # Overall averages, one per metric
    collected = (
        ("embedding_similarity", emb_values),
        ("rougeL", rouge_values),
        ("llm_score", llm_values),
    )
    results["average"] = {
        metric: (np.mean(values) if values else 0.0)
        for metric, values in collected
    }

    return results

In [54]:
# Example input: one question and two four-level rubrics to compare.
question = "Describe the role of mitochondria in cellular respiration."

# First example rubric, keyed by level name.
rubric = dict(
    Emerging="Shows minimal understanding; major misconceptions or irrelevant response.",
    Developing="Demonstrates basic understanding; contains some errors or incomplete explanation.",
    Proficient="Shows good understanding; mostly accurate with minor omissions or inaccuracies.",
    Advanced="Demonstrates excellent understanding with a complete and accurate explanation.",
)

# Second example rubric with the same level names but different wording.
rubric2 = dict(
    Emerging="Shows limited understanding with several misconceptions or irrelevant statements.",
    Developing="Demonstrates some understanding; explanation is basic and may include minor errors or missing details.",
    Proficient="Shows clear understanding; mostly accurate with only a few small omissions.",
    Advanced="Demonstrates strong understanding with a thorough and precise explanation of the mitochondria's role in respiration.",
)

In [69]:
# Compare the two example rubrics; `rubric` plays the generated role and
# `rubric2` the reference role.
compute_score(gen_rubric=rubric, ref_rubric=rubric2, question=question)

{'Emerging': {'embedding_similarity': 0.846237032925695,
  'rougeL': 0.5882352941176471,
  'llm_score': 0.8},
 'Developing': {'embedding_similarity': 0.9184320083035457,
  'rougeL': 0.34782608695652173,
  'llm_score': 1.0},
 'Proficient': {'embedding_similarity': 0.8920768122003521,
  'rougeL': 0.5714285714285713,
  'llm_score': 0.8},
 'Advanced': {'embedding_similarity': 0.6308014881395728,
  'rougeL': 0.4800000000000001,
  'llm_score': 1.0},
 'average': {'embedding_similarity': 0.8218868353922913,
  'rougeL': 0.4968724881256851,
  'llm_score': 0.9}}

In [77]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def evaluate_models(model_rubric_sets, ref_rubrics, questions):
    """
    Score every model's generated rubrics against the references and
    summarize the results.

    model_rubric_sets: dict like {
        'GPT': [gen_rubric1, gen_rubric2, ...],
        'Gemini': [...],
        'Mistral': [...]
    }
    ref_rubrics: list of reference rubrics (same length as each model's list)
    questions: list of question strings (same length as ref_rubrics)

    Returns:
        Whatever ``summarize_results`` returns for the collected scores.

    Raises:
        ValueError: if ``questions`` or any model's rubric list differs in
            length from ``ref_rubrics``. ``zip`` would otherwise truncate
            silently and models would be averaged over different item sets.
    """
    n_items = len(ref_rubrics)

    # Validate everything up front: scoring is slow, so fail before any
    # model has been evaluated rather than midway through.
    if len(questions) != n_items:
        raise ValueError(
            f"questions has {len(questions)} items but ref_rubrics has {n_items}"
        )
    for model_name, gen_rubrics in model_rubric_sets.items():
        if len(gen_rubrics) != n_items:
            raise ValueError(
                f"model {model_name!r} has {len(gen_rubrics)} rubrics "
                f"but ref_rubrics has {n_items}"
            )

    all_results = {}  # model -> list of per-item results

    for model_name, gen_rubrics in model_rubric_sets.items():
        print(f"Evaluating model: {model_name}")

        # zip() has no length, so tell tqdm the total to get a real progress bar.
        triples = tqdm(zip(ref_rubrics, gen_rubrics, questions), total=n_items)
        all_results[model_name] = [
            compute_score(gen_rubric, ref_rubric, question)
            for ref_rubric, gen_rubric, question in triples
        ]

    # Aggregate results
    return summarize_results(all_results)

def summarize_results(all_results):
    """
    Build, print and return the model-level summary table ("Table 1").

    For every model, the "average" entry of each per-item result is collected
    and averaged per metric; the "Mean" column is the mean of the three
    per-metric averages. Items without an "average" entry are ignored, and a
    model with no usable items gets 0 for every metric.

    Only the "average" entries are read. Per-level entries are not
    aggregated, so a per-level ``llm_score`` of None cannot break the summary.

    Args:
        all_results: dict mapping model name -> list of per-item result
            dicts as returned by ``compute_score``.

    Returns:
        pandas.DataFrame with columns
        ["Models", "Embedding_score", "ROUGEL", "LLM_score", "Mean"],
        one row per model in the iteration order of ``all_results``.
    """
    table1_data = []

    for model_name, model_scores in all_results.items():
        emb_scores, rouge_scores, llm_scores = [], [], []

        for res in model_scores:
            avg = res.get("average", {})
            if avg:
                emb_scores.append(avg.get("embedding_similarity", 0))
                rouge_scores.append(avg.get("rougeL", 0))
                llm_scores.append(avg.get("llm_score", 0))

        # --- Table 1 row ---
        mean_emb = np.mean(emb_scores) if emb_scores else 0
        mean_rouge = np.mean(rouge_scores) if rouge_scores else 0
        mean_llm = np.mean(llm_scores) if llm_scores else 0
        overall_mean = np.mean([mean_emb, mean_rouge, mean_llm])
        table1_data.append([model_name, mean_emb, mean_rouge, mean_llm, overall_mean])

    # --- Convert to a DataFrame for nice tabular display ---
    table1 = pd.DataFrame(
        table1_data,
        columns=["Models", "Embedding_score", "ROUGEL", "LLM_score", "Mean"],
    )
    print("\n📊 Table 1: Overall Model Performance")
    print(table1.round(3).to_string(index=False))

    return table1

In [82]:
import csv
import json
import time

# Column names that hold the rubric text for each level, used by default.
DEFAULT_RUBRIC_LEVELS = (
    'Comprehensive response',
    'Competent response',
    'Partial response',
    'Limited response',
)


def load_data_from_csv(file_path, rubric_levels=None):
    """Load rows from a CSV file and attach a ``'rubric'`` dict to each row.

    Args:
        file_path: Path of the UTF-8 encoded CSV file (with a header row).
        rubric_levels: Optional iterable of column names to collect into the
            rubric. Defaults to ``DEFAULT_RUBRIC_LEVELS``.

    Returns:
        A list with one dict per CSV row (as produced by ``csv.DictReader``).
        Each dict additionally has a ``'rubric'`` key mapping every requested
        level that exists as a column to that row's value.
    """
    levels = DEFAULT_RUBRIC_LEVELS if rubric_levels is None else tuple(rubric_levels)

    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are preserved exactly.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csv_file:
        data = list(csv.DictReader(csv_file))

    for entry in data:
        entry['rubric'] = {level: entry[level] for level in levels if level in entry}
    return data

In [None]:
# Parse the gold file once; questions and reference rubrics come from the
# same rows, so reading it twice is unnecessary.
gold_samples = load_data_from_csv('dataset/gold_samples.csv')
questions = [ref['question'] for ref in gold_samples]
ref_rubrics = [ref['rubric'] for ref in gold_samples]

# Generated rubrics, one list per model.
gpt4_rubrics = [gen['rubric'] for gen in load_data_from_csv('dataset/gpt4_gen_samples.csv')]
mistral_rubrics = [gen['rubric'] for gen in load_data_from_csv('dataset/mistral_gen_samples.csv')]
gemini_rubrics = [gen['rubric'] for gen in load_data_from_csv('dataset/gemini_gen_samples.csv')]

In [84]:
import pickle

# Generated rubric lists keyed by the model name shown in the summary table.
model_rubric_sets = {
    "GPT4": gpt4_rubrics,
    "Mistral": mistral_rubrics,
    "Gemini": gemini_rubrics
}

table1 = evaluate_models(model_rubric_sets, ref_rubrics, questions)

# Persist the summary table; the context manager guarantees the file is
# flushed and closed instead of leaking the handle.
with open('evaluation_table1.pkl', 'wb') as pkl_file:
    pickle.dump(table1, pkl_file)

Evaluating model: GPT4


12it [06:01, 30.12s/it]


Evaluating model: Mistral


12it [06:30, 32.54s/it]


Evaluating model: Gemini


12it [06:50, 34.20s/it]


📊 Table 1: Overall Model Performance
 Models  Embedding_score  ROUGEL  LLM_score  Mean
   GPT4            0.622   0.206      0.779 0.536
Mistral            0.612   0.158      0.733 0.501
 Gemini            0.624   0.213      0.767 0.534



