# Rubric Evaluation Notebook

This notebook uses rubric-based evaluation powered by embeddings to assess text responses. It defines criteria for scoring, computes semantic similarity using embeddings, and returns category-based and overall evaluations. The system processes data in batches and outputs both granular and summary insights.
    

In [None]:
import time

import google.generativeai as genai
import numpy as np
import pandas as pd
from chromadb import Documents, Embeddings
from chromadb.utils.embedding_functions import EmbeddingFunction
from google.api_core import exceptions
from google.api_core import retry
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
    

In [None]:
# Gemini embedding function for documents and queries
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``.

    The ``document_mode`` flag selects the ``task_type`` sent with each
    request: ``"retrieval_document"`` when true, ``"retrieval_query"``
    otherwise.
    """

    document_mode = True  # Set to False for query embedding

    def is_retriable(e):
        """Return True when *e* is an error the retry decorator should retry.

        An error qualifies if it is a ``ServiceUnavailable`` or
        ``ResourceExhausted`` exception, or if it carries a ``code``
        attribute equal to 429 or 503.
        """
        return (
            isinstance(e, (exceptions.ServiceUnavailable, exceptions.ResourceExhausted))
            or getattr(e, 'code', None) in {429, 503}
        )

    # The predicate is captured here as a plain function, while the class body
    # is still executing, so it is directly callable by the retry wrapper.
    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed each text in *input* and return one embedding per text.

        Args:
            input: A list of texts; a bare string is treated as a single text.

        Returns:
            A list holding the ``'embedding'`` entry of each API response, in
            input order.
        """
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"
        if isinstance(input, str):
            # Without this, iterating a string would embed it char by char.
            input = [input]
        return [
            genai.embed_content(
                model="models/embedding-001",
                content=text,
                task_type=embedding_task,
            )['embedding']
            for text in input
        ]

    # Re-expose the predicate as a static method so that both
    # ``GeminiEmbeddingFunction.is_retriable(err)`` and
    # ``instance.is_retriable(err)`` work; as a plain function, the instance
    # form would receive ``self`` in place of the exception.
    is_retriable = staticmethod(is_retriable)
    

## Simple Rubric Grading (4-level)

This grader embeds a submission and assigns it the grade whose rubric descriptor is most similar to it (by cosine similarity). The fixed rubric has 4 grade levels: Excellent, Good, Average, and Poor.
    

In [None]:
class RubricGrader:
    """Grade a submission by matching it to the closest rubric descriptor.

    Each of the four grade labels has a one-sentence descriptor. The
    descriptors are embedded once at construction; a submission receives the
    label whose descriptor embedding has the highest cosine similarity to the
    submission's embedding.
    """

    def __init__(self):
        self.embedding_function = GeminiEmbeddingFunction()
        self.rubric_prompts = {
            "Excellent": "Well-organized, thorough, in-depth analysis with strong supporting evidence.",
            "Good": "Covers most points with clarity, some minor lack of depth.",
            "Average": "Basic response with limited analysis or structure.",
            "Poor": "Incomplete, incoherent, or missing key points."
        }
        self.rubric_embeddings = self._compute_rubric_embeddings()

    def _compute_rubric_embeddings(self):
        """Embed all rubric descriptors in one call and key them by grade."""
        grades = list(self.rubric_prompts)
        descriptor_texts = [self.rubric_prompts[grade] for grade in grades]
        vectors = self.embedding_function(descriptor_texts)
        return dict(zip(grades, vectors))

    def grade_submission(self, submission_prompt: str):
        """Return ``(grade, similarity)`` for the best-matching descriptor.

        Args:
            submission_prompt: The text to grade.

        Returns:
            A tuple of the winning grade label and its cosine similarity to
            the submission. On ties, the grade listed first in the rubric wins.
        """
        submission_vec = self.embedding_function([submission_prompt])[0]

        similarity_by_grade = {}
        for grade, rubric_vec in self.rubric_embeddings.items():
            # cosine_similarity returns a 1x1 matrix for a single pair.
            pair_matrix = cosine_similarity([submission_vec], [rubric_vec])
            similarity_by_grade[grade] = pair_matrix[0][0]

        top_grade = max(similarity_by_grade, key=similarity_by_grade.get)
        return top_grade, similarity_by_grade[top_grade]
    

## Fine-grained Evaluation with Rubric Categories

Each response is evaluated on four dimensions: Relevance, Accuracy, Completeness, and Clarity. Each dimension is scored from 1 to 5 by picking the level whose descriptive anchor is most similar to the response.
    

In [None]:
class ResponseEvaluator:
    """Score a response on four rubric categories via embedding similarity.

    Every category has five anchor sentences, one per level (1-5). Anchors are
    embedded once at construction. A response's score in a category is the
    level whose anchor embedding is most similar to the response embedding.
    """

    def __init__(self):
        self.embedding_function = GeminiEmbeddingFunction()
        self.category_rubrics = {
            'Relevance': {
                5: "Fully addresses all prompt aspects with perfect alignment.",
                4: "Covers main points with strong relevance.",
                3: "Some relevant content with partial alignment.",
                2: "Significant deviation from prompt.",
                1: "Fails to address prompt."
            },
            'Accuracy': {
                5: "Completely factual and verifiable.",
                4: "Mostly accurate with minor errors.",
                3: "Mixed accuracy and issues.",
                2: "Contains multiple inaccuracies.",
                1: "Largely incorrect."
            },
            'Completeness': {
                5: "Covers all elements thoroughly with insight.",
                4: "Thoroughly covers main points.",
                3: "Covers most elements, some gaps.",
                2: "Misses several elements.",
                1: "Significantly incomplete."
            },
            'Clarity': {
                5: "Exceptionally clear and well-structured.",
                4: "Well-organized and understandable.",
                3: "Understandable but some disorganization.",
                2: "Poorly structured or vague.",
                1: "Confusing and unclear."
            }
        }
        self.category_embeddings = self._compute_rubric_embeddings()

    def _compute_rubric_embeddings(self):
        """Embed every anchor sentence, keeping the category -> level nesting."""
        embedded = {}
        for category, anchors in self.category_rubrics.items():
            per_level = {}
            for level, anchor_text in anchors.items():
                # One embedding call per anchor sentence.
                per_level[level] = self.embedding_function([anchor_text])[0]
            embedded[category] = per_level
        return embedded

    def evaluate_response(self, response: str):
        """Evaluate *response* against every rubric category.

        Args:
            response: The text to evaluate.

        Returns:
            A dict with:
              - ``'scores'``: category -> best-matching level (1-5)
              - ``'similarities'``: category -> cosine similarity of that match
              - ``'overall_score'``: mean of the category scores
              - ``'strongest_category'`` / ``'weakest_category'``: categories
                with the highest / lowest best-match similarity
        """
        response_vec = self.embedding_function([response])[0]

        scores = {}
        similarities = {}
        for category, anchor_vecs in self.category_embeddings.items():
            level_sims = {}
            for level, anchor_vec in anchor_vecs.items():
                level_sims[level] = cosine_similarity([response_vec], [anchor_vec])[0][0]
            # On ties the level listed first in the rubric (the higher one) wins.
            top_level = max(level_sims, key=level_sims.get)
            scores[category] = top_level
            similarities[category] = level_sims[top_level]

        overall = np.mean(list(scores.values()))
        # NOTE(review): strongest/weakest are ranked by match similarity, not
        # by the 1-5 score, so a confident match to a low-level anchor can be
        # reported as "strongest" -- confirm this is the intended meaning.
        strongest = max(similarities, key=similarities.get)
        weakest = min(similarities, key=similarities.get)

        return {
            'scores': scores,
            'similarities': similarities,
            'overall_score': overall,
            'strongest_category': strongest,
            'weakest_category': weakest
        }
    