# Custom LLM Judges

In [None]:
# Load OpenAI API key
# NOTE(review): presumably load_dotenv() reads a local .env file and exports its
# variables (e.g. OPENAI_API_KEY) into the process environment so the OpenAI
# client created later can authenticate — confirm the .env location and key name.
from dotenv import load_dotenv
load_dotenv()


In [None]:
import json
from typing import TypedDict, Any
from abc import ABC
from langchain_openai import ChatOpenAI

# Define LLM judge output structure 
class JudgeOutput(TypedDict):
    """Verdict returned by a judge.

    ``score`` is an integer on the judge's scale, or 0 when the model reply
    could not be parsed; ``justification`` is the model's explanation (or a
    description of the parsing failure).
    """

    score: int
    justification: str


# Define superclass
class LLMJudge(ABC):
    """
    Base class for LLM judges.

    A judge wraps a chat model and an evaluation instruction, asks the model
    to grade an answer, and converts the reply into a ``JudgeOutput``.
    """

    # Inclusive score range advertised in the prompt and enforced when parsing.
    _MIN_SCORE = 1
    _MAX_SCORE = 5

    def __init__(self, name: str, prompt: str, model: str = "gpt-5-mini"):
        """Store the judge's name and instruction and create the chat model.

        Args:
            name: Label identifying this judge.
            prompt: Evaluation instruction placed at the top of every prompt.
            model: Model identifier passed to ``ChatOpenAI``.
        """
        self.name = name
        self.prompt = prompt
        self.llm = ChatOpenAI(model=model)

    def format_prompt(self, answer: str, context: str = "", question: str = "") -> str:
        """Build the full prompt sent to the model.

        Args:
            answer: The answer being graded (always included).
            context: Optional reference context; omitted when empty.
            question: Optional user question; omitted when empty.

        Returns:
            The evaluation task, the optional question and context sections,
            the answer, and the JSON response-format instruction.
        """
        sections = [f"Evaluation Task: {self.prompt}\n\n"]
        if question:
            sections.append(f"User Question:\n{question}\n\n")
        if context:
            sections.append(f"Reference Context:\n{context}\n\n")
        sections.append(f"Answer to Evaluate:\n{answer}\n\n")
        sections.append(
            f'Respond with a JSON object: {{"score": {self._MIN_SCORE}-{self._MAX_SCORE}, '
            '"justification": "string"}'
        )
        return "".join(sections)

    @staticmethod
    def _content_text(content: Any) -> str:
        """Flatten a model reply's content into plain text."""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            # Some chat models return a list of content blocks instead of a
            # single string; keep only the textual pieces.
            pieces = []
            for block in content:
                if isinstance(block, str):
                    pieces.append(block)
                elif isinstance(block, dict) and isinstance(block.get("text"), str):
                    pieces.append(block["text"])
            return "".join(pieces)
        return str(content)

    @staticmethod
    def _extract_json(text: str) -> dict:
        """Parse the JSON object in *text*, tolerating fences or surrounding prose."""
        try:
            parsed = json.loads(text)
        except ValueError:
            # Models frequently wrap JSON in ```json fences or add commentary;
            # retry on the outermost {...} span before giving up.
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise TypeError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed

    @classmethod
    def _coerce_score(cls, raw: Any) -> int:
        """Convert *raw* to an int within the allowed range or raise ValueError/TypeError."""
        if isinstance(raw, bool):
            # bool is an int subclass; JSON true/false is never a valid score.
            raise ValueError(f"score is not a number: {raw!r}")
        number = float(raw)  # accepts ints, floats and numeric strings such as "4"
        if not number.is_integer():
            raise ValueError(f"score is not an integer: {raw!r}")
        score = int(number)
        if not cls._MIN_SCORE <= score <= cls._MAX_SCORE:
            raise ValueError(
                f"score out of range {cls._MIN_SCORE}-{cls._MAX_SCORE}: {score}"
            )
        return score

    def evaluate(self, answer: str, context: str = "", question: str = "") -> JudgeOutput:
        """
        Calls LLM and parses its verdict.

        Args:
            answer: The answer being graded.
            context: Optional reference context.
            question: Optional user question.

        Returns:
            The parsed score and justification. If the reply is not a JSON
            object with a valid integer ``score`` in range and a
            ``justification``, returns ``score=0`` with a justification that
            describes the parsing error and includes the raw reply.

        Errors raised by the model call itself are not caught here.
        """
        prompt = self.format_prompt(answer, context, question)
        response = self.llm.invoke(prompt)
        raw = response.content
        try:
            parsed = self._extract_json(self._content_text(raw))
            score = self._coerce_score(parsed["score"])
            justification = str(parsed["justification"])
        except (ValueError, KeyError, TypeError) as e:
            # json.JSONDecodeError is a ValueError; KeyError covers missing fields.
            return JudgeOutput(score=0, justification=f"Parsing error: {e}, raw: {raw}")
        return JudgeOutput(score=score, justification=justification)

# Define custom judges
class ContextRelevanceJudge(LLMJudge):
    """Judge that scores whether the reference context can answer the user's question."""

    def __init__(self, model: str = "gpt-5-mini"):
        """Create the judge.

        Args:
            model: Model identifier forwarded to ``LLMJudge``.
        """
        # The instruction targets the *context* (not the answer) so that it
        # agrees with the 5/1 rubric below, which is phrased in terms of
        # whether the context contains the information needed for the query.
        super().__init__(
            name="ContextRelevance",
            prompt="""Evaluate how relevant the provided context is to the user's question.
            A score of 5 means the context was completely relevant. The context contains the necessary information to answer the Human query.
            A score of 1 means the context was completely irrelevant. The context does not contain information to answer the Human query.""",
            model=model
        )

class AnswerGroundednessJudge(LLMJudge):
    """Judge that scores how well the answer is supported by the provided context."""

    def __init__(self, model="gpt-5-mini"):
        # Instruction plus the meaning of the two ends of the 1-5 scale.
        rubric = """Evaluate how well the answer is grounded in the provided context, avoiding hallucinations.
            A score of 5 means that the generated response is well-grounded in the context. All statements from the answer are supported in the context.
            A score of 1 means that the generated response is not grounded. The statements in the answer are hallucinated and not supported by the context."""
        super().__init__("AnswerGroundedness", rubric, model)

class AnswerRelevanceJudge(LLMJudge):
    """Judge that scores how directly the answer addresses the user's question."""

    def __init__(self, model="gpt-5-mini"):
        # Instruction plus the meaning of the two ends of the 1-5 scale.
        rubric = """Evaluate how directly the answer addresses the user’s question.
            A score of 5 means that the generated response is relevant to the Human query. It answers the Human query.
            A score of 1 means that the generated response is irrelevant to the Human query. It is not helpful in answer the query."""
        super().__init__("AnswerRelevance", rubric, model)

## Example Usage

In [None]:
from langchain_openai import ChatOpenAI

# One instance of each judge; all three grade the same sample below.
judges = [ContextRelevanceJudge(), AnswerRelevanceJudge(), AnswerGroundednessJudge()]

# Sample under evaluation: a question, the reference context, and the answer to grade.
question = "Where is Mount Rainier located?"
context = (
    "Mount Rainier is a stratovolcano located in the Cascade Range of Washington. "
    "It stands at 14,411 feet and is the most glaciated peak in the contiguous United States."
)
answer = "Mount Rainier is the highest mountain in Washington State."

for judge in judges:
    # Show the exact prompt this judge builds for the sample.
    prompt = judge.format_prompt(answer=answer, context=context, question=question)
    print(f"\n--- {judge.name} Prompt ---\n{prompt}\n")

    # Ask the model for its verdict and display it next to the judge's name.
    result = judge.evaluate(answer=answer, context=context, question=question)
    print(judge.name, "→", result)