diff --git a/src/server/api/utils/testbed_metrics.py b/src/server/api/utils/testbed_metrics.py
new file mode 100644
index 00000000..7cb4b77c
--- /dev/null
+++ b/src/server/api/utils/testbed_metrics.py
@@ -0,0 +1,109 @@
+"""
+Copyright (c) 2024, 2025, Oracle and/or its affiliates.
+Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
+
+Custom metrics for testbed evaluation.
+
+This module provides a customizable correctness metric for evaluating chatbot answers
+against reference answers. Unlike Giskard's default CorrectnessMetric which has a
+hardcoded prompt, this allows the system prompt to be configured via MCP prompts.
+"""
+# spell-checker:ignore giskard
+
+from giskard.rag.metrics import CorrectnessMetric
+from giskard.llm.client import ChatMessage, LLMClient, get_default_client
+from giskard.llm.errors import LLMGenerationError
+from giskard.rag.base import AgentAnswer
+from giskard.rag.question_generators.utils import parse_json_output
+
+
+def format_conversation(conversation: list[dict]) -> str:
+    """Format conversation history for the evaluation prompt."""
+    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}" for msg in conversation])
+
+
+CORRECTNESS_INPUT_TEMPLATE = """
+### AGENT DESCRIPTION
+{description}
+
+### CONVERSATION
+{conversation}
+
+### AGENT ANSWER
+{answer}
+
+### EXPECTED ANSWER
+{reference_answer}
+"""
+
+
+class CustomCorrectnessMetric(CorrectnessMetric):  # pylint: disable=too-few-public-methods
+    """Custom correctness metric with configurable system prompt."""
+
+    def __init__(
+        self,
+        name: str,
+        system_prompt: str,
+        llm_client: LLMClient = None,
+        agent_description: str = None,
+    ):
+        """Initialize the custom correctness metric.
+
+        Args:
+            name: The metric name (typically "correctness").
+            system_prompt: The system prompt for the judge LLM.
+            llm_client: Optional LLM client. If not provided, uses Giskard's default.
+            agent_description: Description of the agent being evaluated.
+        """
+        # Call parent with name and llm_client only (CorrectnessMetric signature)
+        super().__init__(name=name, llm_client=llm_client)
+        self.system_prompt = system_prompt
+        self.agent_description = agent_description or "A chatbot answering questions."
+
+    def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
+        """Evaluate correctness of agent answer vs reference."""
+        llm_client = self._llm_client or get_default_client()
+        try:
+            out = llm_client.complete(
+                messages=[
+                    ChatMessage(role="system", content=self.system_prompt),
+                    ChatMessage(
+                        role="user",
+                        content=CORRECTNESS_INPUT_TEMPLATE.format(
+                            conversation=format_conversation(
+                                question_sample.conversation_history
+                                + [{"role": "user", "content": question_sample.question}]
+                            ),
+                            answer=answer.message,
+                            reference_answer=question_sample.reference_answer,
+                            description=self.agent_description,
+                        ),
+                    ),
+                ],
+                temperature=0,
+                format="json_object",
+            )
+
+            json_output = parse_json_output(
+                out.content,
+                llm_client=llm_client,
+                keys=["correctness", "correctness_reason"],
+                caller_id=self.__class__.__name__,
+            )
+
+            if "correctness" in json_output and not isinstance(json_output["correctness"], bool):
+                raise LLMGenerationError(
+                    f"Error in correctness evaluation: {json_output['correctness']}. "
+                    "Expected boolean value for 'correctness' key."
+                )
+
+            # Strip correctness_reason when correct (LLM sometimes includes it anyway)
+            if json_output.get("correctness") is True:
+                json_output.pop("correctness_reason", None)
+
+            return json_output
+
+        except LLMGenerationError:
+            raise
+        except Exception as err:
+            raise LLMGenerationError("Error while evaluating the agent") from err
diff --git a/src/server/api/v1/testbed.py b/src/server/api/v1/testbed.py
index 93c84ad6..dfe0c869 100644
--- a/src/server/api/v1/testbed.py
+++ b/src/server/api/v1/testbed.py
@@ -4,7 +4,6 @@
 """
 # spell-checker:ignore testsets testset giskard litellm
 
-import asyncio
 import pickle
 import shutil
 
@@ -12,6 +11,7 @@
 import json
 from typing import Optional
 from giskard.rag import evaluate, QATestset
+from giskard.rag.base import AgentAnswer
 from giskard.llm import set_llm_model
 from fastapi import APIRouter, HTTPException, Header, UploadFile
 from fastapi.responses import JSONResponse
@@ -24,6 +24,8 @@
 import server.api.utils.testbed as utils_testbed
 import server.api.utils.databases as utils_databases
 import server.api.utils.models as utils_models
+from server.api.utils.testbed_metrics import CustomCorrectnessMetric
+from server.mcp.prompts.defaults import get_prompt_with_override
 
 from server.api.v1 import chat
 
@@ -229,26 +231,29 @@ async def testbed_generate_qa(
     return testset_qa
 
 
+async def _collect_testbed_answers(loaded_testset: QATestset, client: str) -> list[AgentAnswer]:
+    """Collect answers from the chatbot for all questions in the testset."""
+    answers = []
+    for sample in loaded_testset.to_pandas().itertuples():
+        request = schema.ChatRequest(
+            messages=[ChatMessage(role="human", content=sample.question)],
+        )
+        ai_response = await chat.chat_post(client=client, request=request)
+        answers.append(AgentAnswer(message=ai_response["choices"][0]["message"]["content"]))
+    return answers
+
+
 @auth.post(
     "/evaluate",
     description="Evaluate Q&A Test Set.",
     response_model=schema.EvaluationReport,
 )
-def testbed_evaluate(
+async def testbed_evaluate(
     tid: schema.TestSetsIdType,
     judge: str,
     client: schema.ClientIdType = Header(default="server"),
 ) -> schema.EvaluationReport:
     """Run evaluate against a testset"""
-
-    def get_answer(question: str):
-        """Submit question against the chatbot"""
-        request = schema.ChatRequest(
-            messages=[ChatMessage(role="human", content=question)],
-        )
-        ai_response = asyncio.run(chat.chat_post(client=client, request=request))
-        return ai_response["choices"][0]["message"]["content"]
-
     evaluated = datetime.now().isoformat()
     client_settings = utils_settings.get_client(client)
     # Disable History
@@ -271,8 +276,23 @@ def get_answer(question: str):
 
     judge_config = utils_models.get_litellm_config(model_config={"model": judge}, oci_config=oci_config, giskard=True)
     set_llm_model(llm_model=judge, **judge_config)
+
+    # Get judge prompt from MCP (allows override via Prompt Engineering page)
+    judge_prompt_message = get_prompt_with_override("optimizer_testbed-judge")
+    judge_prompt = judge_prompt_message.content.text
+
+    # Create custom metric with the configurable prompt
+    custom_metric = CustomCorrectnessMetric(
+        name="correctness",
+        system_prompt=judge_prompt,
+        agent_description="A chatbot answering questions.",
+    )
+
+    # Pre-compute answers asynchronously to avoid event loop conflicts with LiteLLM
+    answers = await _collect_testbed_answers(loaded_testset, client)
+
     try:
-        report = evaluate(get_answer, testset=loaded_testset, metrics=None)
+        report = evaluate(answers, testset=loaded_testset, metrics=[custom_metric])
     except KeyError as ex:
         if str(ex) == "'correctness'":
             raise HTTPException(status_code=500, detail="Unable to determine the correctness; please retry.") from ex
diff --git a/src/server/mcp/prompts/defaults.py b/src/server/mcp/prompts/defaults.py
index 753ab6fb..f56e8773 100644
--- a/src/server/mcp/prompts/defaults.py
+++ b/src/server/mcp/prompts/defaults.py
@@ -2,8 +2,8 @@
 Copyright (c) 2024, 2025, Oracle and/or its affiliates.
 Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
 """
+# spell-checker:ignore fastmcp giskard
 
-# spell-checker:ignore fastmcp
 from fastmcp.prompts.prompt import PromptMessage, TextContent
 
 from server.mcp.prompts import cache
@@ -216,6 +216,51 @@ def optimizer_vs_rephrase() -> PromptMessage:
     return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))
 
 
+def optimizer_testbed_judge() -> PromptMessage:
+    """Prompt for testbed evaluation judge.
+
+    Used to evaluate whether a chatbot's answer correctly matches the reference answer.
+    This prompt is more lenient than the default Giskard prompt - it allows additional
+    context in answers and only marks as incorrect when essential information is missing
+    or contradicted.
+    """
+    content = """
+    You are evaluating whether an AI assistant correctly answered a question.
+
+    EVALUATION CRITERIA:
+    1. CORRECT if the agent's answer contains the essential information from the EXPECTED ANSWER
+    2. Additional context, elaboration, historical background, or helpful details beyond the expected answer should NOT be penalized
+    3. INCORRECT only if the agent's answer to the specific question asked contradicts or conflicts with the expected answer
+
+    Consider the answer CORRECT if:
+    - The core question is answered accurately with facts matching the expected answer
+    - The agent provides extra context, background, comparisons, or elaboration (this is GOOD)
+    - Additional information about related topics does not change the core answer
+
+    Consider the answer INCORRECT if:
+    - The direct answer to the question contradicts the expected answer
+    - Essential information from the expected answer is missing or wrong
+    - The agent admits it doesn't know or cannot answer
+
+    IMPORTANT: Additional context is NOT a contradiction. For example:
+    - Expected: "The new default is X"
+    - Agent says: "The new default is X. Previously it was Y."
+    - This is CORRECT - the agent answered the question correctly and added helpful context.
+
+    You will receive:
+    - AGENT DESCRIPTION: What the agent does
+    - CONVERSATION: The chat history
+    - AGENT ANSWER: What the agent responded
+    - EXPECTED ANSWER: The correct answer to compare against
+
+    Output ONLY valid JSON with no additional text:
+    - If correct: {"correctness": true}
+    - If incorrect: {"correctness": false, "correctness_reason": "brief explanation of what was wrong or missing"}
+    """
+
+    return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))
+
+
 # MCP Registration
 async def register(mcp):
     """Register Out-of-Box Prompts"""
@@ -282,3 +327,14 @@ def rephrase_mcp() -> PromptMessage:
         based on conversation history before performing retrieval.
         """
         return get_prompt_with_override("optimizer_vs-rephrase")
+
+    @mcp.prompt(name="optimizer_testbed-judge", title="Testbed Judge Prompt", tags=optimizer_tags)
+    def testbed_judge_mcp() -> PromptMessage:
+        """Prompt for testbed evaluation judge.
+
+        Used by the testbed to evaluate whether the chatbot's answer matches the reference.
+        Configurable to adjust evaluation strictness. The default prompt is lenient -
+        it allows additional context in answers and only fails on contradictions or
+        missing essential information.
+        """
+        return get_prompt_with_override("optimizer_testbed-judge")
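
For orientation, a minimal sketch of how the pieces introduced above fit together outside the FastAPI endpoint. This is not part of the diff: the judge model name is illustrative, the judge configuration kwargs are omitted, and the module-private `_collect_testbed_answers` helper is reused directly for brevity.

# Hypothetical driver for the new evaluation flow; "gpt-4o" is an illustrative judge model.
from giskard.rag import evaluate, QATestset
from giskard.llm import set_llm_model

from server.api.utils.testbed_metrics import CustomCorrectnessMetric
from server.api.v1.testbed import _collect_testbed_answers
from server.mcp.prompts.defaults import get_prompt_with_override


async def run_evaluation(loaded_testset: QATestset, client: str = "server"):
    """Evaluate a loaded testset with the configurable judge prompt."""
    set_llm_model(llm_model="gpt-4o")  # judge config kwargs omitted for brevity

    # The judge prompt comes from MCP, so an override saved via the
    # Prompt Engineering page takes precedence over the default.
    judge_prompt = get_prompt_with_override("optimizer_testbed-judge").content.text

    metric = CustomCorrectnessMetric(
        name="correctness",
        system_prompt=judge_prompt,
        agent_description="A chatbot answering questions.",
    )

    # Answers are pre-computed as a list of AgentAnswer objects, so evaluate()
    # is not handed a callable and no nested event loop is required.
    answers = await _collect_testbed_answers(loaded_testset, client)
    return evaluate(answers, testset=loaded_testset, metrics=[metric])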