Merged
Changes from all commits (44 commits)
245cf84
bring in MCP Prompts
gotsysdba Nov 20, 2025
f210bde
integrate mcp prompts
gotsysdba Nov 20, 2025
b985ab1
SelectAI Endpoint
gotsysdba Nov 20, 2025
87006a4
Reorg DB helm and OCI NSG
gotsysdba Nov 22, 2025
fe0a894
remove api/core
gotsysdba Nov 22, 2025
ddc97b6
Update tests
gotsysdba Nov 22, 2025
515a4cb
Fix Vector Search bug
gotsysdba Nov 22, 2025
5a5bc4c
Fix client bugs with integration tests
gotsysdba Nov 23, 2025
69621d7
completed client tests and bug fixes
gotsysdba Nov 23, 2025
d228bb8
Add standard ignores
gotsysdba Nov 23, 2025
0bde6fc
tests completed
gotsysdba Nov 23, 2025
e464a71
add mypy to test
gotsysdba Nov 23, 2025
bf7669e
Remove SelectAI
gotsysdba Nov 23, 2025
8898cb5
linted
gotsysdba Nov 23, 2025
6961010
simplify
gotsysdba Nov 23, 2025
95a40d9
pylint and pytest
gotsysdba Nov 23, 2025
90ba652
ignore mypy
gotsysdba Nov 23, 2025
6f268dd
Pylint all src code
gotsysdba Nov 23, 2025
b890785
include optimizer mcp tools
gotsysdba Nov 23, 2025
bd6cbae
Copy MCP utils
gotsysdba Nov 23, 2025
68a0a13
fix key error
gotsysdba Nov 23, 2025
f32d5ca
Cleanup
gotsysdba Nov 24, 2025
c77e3e1
Bring back LL selection; add test
gotsysdba Nov 24, 2025
745f2d3
fix additional bugs and create tests
gotsysdba Nov 24, 2025
a107a80
Fixed SQL error
ldemarchis Nov 24, 2025
6867997
Tests for SQL source error
gotsysdba Nov 24, 2025
b6c690f
Bump langchain-core
gotsysdba Nov 24, 2025
63e4d07
Merge @ViliTajnic fixes (partial)
gotsysdba Nov 24, 2025
87d1cbd
Closes #335 (add debugging to catch real error on next failure)
gotsysdba Nov 24, 2025
0a6acde
Closes #320
gotsysdba Nov 24, 2025
202ba8e
Fixup mcp code, resolve prompt bug
gotsysdba Nov 24, 2025
fc9a5af
remove debug
gotsysdba Nov 25, 2025
c4c4823
Settings Pytests fixup
gotsysdba Nov 25, 2025
a9a5e10
sort out streaming endpoints
gotsysdba Nov 25, 2025
ca97676
Add custom metric to allow prompt engineering on judging criteria.
gotsysdba Nov 25, 2025
bc1761c
Merge remote-tracking branch 'origin/main' into remove_select_ai
gotsysdba Nov 25, 2025
f64fe4c
Merge remote-tracking branch 'origin/main' into remove_select_ai
gotsysdba Nov 26, 2025
867d96f
Remove Demoware
gotsysdba Nov 26, 2025
82bba36
Merge remote-tracking branch 'origin/main' into remove_select_ai
gotsysdba Nov 26, 2025
bb0137a
Fix after recent behavioral change in FastAPI/Starlette. The HTTPBear…
gotsysdba Nov 26, 2025
29911a3
Merge branch 'remove_select_ai' into testbed-judge-prompt
gotsysdba Nov 26, 2025
b50f7cd
Merge remote-tracking branch 'origin/main' into testbed-judge-prompt
gotsysdba Nov 26, 2025
31133a9
Custom Judge
gotsysdba Nov 26, 2025
9d968da
Merge branch 'main' into testbed-judge-prompt
gotsysdba Nov 27, 2025
109 changes: 109 additions & 0 deletions src/server/api/utils/testbed_metrics.py
@@ -0,0 +1,109 @@
"""
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.

Custom metrics for testbed evaluation.

This module provides a customizable correctness metric for evaluating chatbot answers
against reference answers. Unlike Giskard's default CorrectnessMetric which has a
hardcoded prompt, this allows the system prompt to be configured via MCP prompts.
"""
# spell-checker:ignore giskard

from giskard.rag.metrics import CorrectnessMetric
from giskard.llm.client import ChatMessage, LLMClient, get_default_client
from giskard.llm.errors import LLMGenerationError
from giskard.rag.base import AgentAnswer
from giskard.rag.question_generators.utils import parse_json_output


def format_conversation(conversation: list[dict]) -> str:
"""Format conversation history for the evaluation prompt."""
return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])


CORRECTNESS_INPUT_TEMPLATE = """
### AGENT DESCRIPTION
{description}

### CONVERSATION
{conversation}

### AGENT ANSWER
{answer}

### EXPECTED ANSWER
{reference_answer}
"""


class CustomCorrectnessMetric(CorrectnessMetric): # pylint: disable=too-few-public-methods
"""Custom correctness metric with configurable system prompt."""

def __init__(
self,
name: str,
system_prompt: str,
llm_client: LLMClient = None,
agent_description: str = None,
):
"""Initialize the custom correctness metric.

Args:
name: The metric name (typically "correctness").
system_prompt: The system prompt for the judge LLM.
llm_client: Optional LLM client. If not provided, uses Giskard's default.
agent_description: Description of the agent being evaluated.
"""
# Call parent with name and llm_client only (CorrectnessMetric signature)
super().__init__(name=name, llm_client=llm_client)
self.system_prompt = system_prompt
self.agent_description = agent_description or "A chatbot answering questions."

def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
"""Evaluate correctness of agent answer vs reference."""
llm_client = self._llm_client or get_default_client()
try:
out = llm_client.complete(
messages=[
ChatMessage(role="system", content=self.system_prompt),
ChatMessage(
role="user",
content=CORRECTNESS_INPUT_TEMPLATE.format(
conversation=format_conversation(
question_sample.conversation_history
+ [{"role": "user", "content": question_sample.question}]
),
answer=answer.message,
reference_answer=question_sample.reference_answer,
description=self.agent_description,
),
),
],
temperature=0,
format="json_object",
)

json_output = parse_json_output(
out.content,
llm_client=llm_client,
keys=["correctness", "correctness_reason"],
caller_id=self.__class__.__name__,
)

if "correctness" in json_output and not isinstance(json_output["correctness"], bool):
raise LLMGenerationError(
f"Error in correctness evaluation: {json_output['correctness']}. "
"Expected boolean value for 'correctness' key."
)

# Strip correctness_reason when correct (LLM sometimes includes it anyway)
if json_output.get("correctness") is True:
json_output.pop("correctness_reason", None)

return json_output

except LLMGenerationError:
raise
except Exception as err:
raise LLMGenerationError("Error while evaluating the agent") from err
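
For context, a minimal sketch of how the new metric plugs into Giskard's `evaluate` outside the API route, mirroring the call added in `testbed.py` below. The testset file name, the hard-coded answer, and the inline judge prompt are illustrative assumptions only; in this PR the prompt comes from the MCP override and the answers come from the chat endpoint.

```python
# Illustrative sketch only (not part of this diff): exercising the metric
# directly against a saved Giskard testset. The file name, answer text, and
# judge prompt below are placeholders.
from giskard.rag import evaluate, QATestset
from giskard.rag.base import AgentAnswer

from server.api.utils.testbed_metrics import CustomCorrectnessMetric

# Assumption: a testset previously saved with QATestset.save("testset.jsonl")
testset = QATestset.load("testset.jsonl")

# Assumption: one pre-computed answer per testset question, in question order
answers = [AgentAnswer(message="The new default is X. Previously it was Y.")]

metric = CustomCorrectnessMetric(
    name="correctness",
    system_prompt="You are evaluating whether an AI assistant correctly answered a question. ...",
    agent_description="A chatbot answering questions.",
)

# evaluate() accepts pre-computed answers in place of an answer function,
# which is exactly how the updated testbed_evaluate endpoint calls it.
report = evaluate(answers, testset=testset, metrics=[metric])
report.to_html("evaluation_report.html")  # write the evaluation report out
```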
44 changes: 32 additions & 12 deletions src/server/api/v1/testbed.py
@@ -4,14 +4,14 @@
"""
# spell-checker:ignore testsets testset giskard litellm

import asyncio
import pickle
import shutil

from datetime import datetime
import json
from typing import Optional
from giskard.rag import evaluate, QATestset
from giskard.rag.base import AgentAnswer
from giskard.llm import set_llm_model
from fastapi import APIRouter, HTTPException, Header, UploadFile
from fastapi.responses import JSONResponse
@@ -24,6 +24,8 @@
import server.api.utils.testbed as utils_testbed
import server.api.utils.databases as utils_databases
import server.api.utils.models as utils_models
from server.api.utils.testbed_metrics import CustomCorrectnessMetric
from server.mcp.prompts.defaults import get_prompt_with_override

from server.api.v1 import chat

@@ -229,26 +231,29 @@ async def testbed_generate_qa(
    return testset_qa


async def _collect_testbed_answers(loaded_testset: QATestset, client: str) -> list[AgentAnswer]:
    """Collect answers from the chatbot for all questions in the testset."""
    answers = []
    for sample in loaded_testset.to_pandas().itertuples():
        request = schema.ChatRequest(
            messages=[ChatMessage(role="human", content=sample.question)],
        )
        ai_response = await chat.chat_post(client=client, request=request)
        answers.append(AgentAnswer(message=ai_response["choices"][0]["message"]["content"]))
    return answers


@auth.post(
    "/evaluate",
    description="Evaluate Q&A Test Set.",
    response_model=schema.EvaluationReport,
)
def testbed_evaluate(
async def testbed_evaluate(
    tid: schema.TestSetsIdType,
    judge: str,
    client: schema.ClientIdType = Header(default="server"),
) -> schema.EvaluationReport:
    """Run evaluate against a testset"""

    def get_answer(question: str):
        """Submit question against the chatbot"""
        request = schema.ChatRequest(
            messages=[ChatMessage(role="human", content=question)],
        )
        ai_response = asyncio.run(chat.chat_post(client=client, request=request))
        return ai_response["choices"][0]["message"]["content"]

    evaluated = datetime.now().isoformat()
    client_settings = utils_settings.get_client(client)
    # Disable History
@@ -271,8 +276,23 @@ def get_answer(question: str):

    judge_config = utils_models.get_litellm_config(model_config={"model": judge}, oci_config=oci_config, giskard=True)
    set_llm_model(llm_model=judge, **judge_config)

    # Get judge prompt from MCP (allows override via Prompt Engineering page)
    judge_prompt_message = get_prompt_with_override("optimizer_testbed-judge")
    judge_prompt = judge_prompt_message.content.text

    # Create custom metric with the configurable prompt
    custom_metric = CustomCorrectnessMetric(
        name="correctness",
        system_prompt=judge_prompt,
        agent_description="A chatbot answering questions.",
    )

    # Pre-compute answers asynchronously to avoid event loop conflicts with LiteLLM
    answers = await _collect_testbed_answers(loaded_testset, client)

    try:
        report = evaluate(get_answer, testset=loaded_testset, metrics=None)
        report = evaluate(answers, testset=loaded_testset, metrics=[custom_metric])
    except KeyError as ex:
        if str(ex) == "'correctness'":
            raise HTTPException(status_code=500, detail="Unable to determine the correctness; please retry.") from ex
58 changes: 57 additions & 1 deletion src/server/mcp/prompts/defaults.py
@@ -2,8 +2,8 @@
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
"""
# spell-checker:ignore fastmcp giskard

# spell-checker:ignore fastmcp
from fastmcp.prompts.prompt import PromptMessage, TextContent
from server.mcp.prompts import cache

@@ -216,6 +216,51 @@ def optimizer_vs_rephrase() -> PromptMessage:
    return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))


def optimizer_testbed_judge() -> PromptMessage:
    """Prompt for testbed evaluation judge.

    Used to evaluate whether a chatbot's answer correctly matches the reference answer.
    This prompt is more lenient than the default Giskard prompt - it allows additional
    context in answers and only marks as incorrect when essential information is missing
    or contradicted.
    """
    content = """
You are evaluating whether an AI assistant correctly answered a question.

EVALUATION CRITERIA:
1. CORRECT if the agent's answer contains the essential information from the EXPECTED ANSWER
2. Additional context, elaboration, historical background, or helpful details beyond the expected answer should NOT be penalized
3. INCORRECT only if the agent's answer to the specific question asked contradicts or conflicts with the expected answer

Consider the answer CORRECT if:
- The core question is answered accurately with facts matching the expected answer
- The agent provides extra context, background, comparisons, or elaboration (this is GOOD)
- Additional information about related topics does not change the core answer

Consider the answer INCORRECT if:
- The direct answer to the question contradicts the expected answer
- Essential information from the expected answer is missing or wrong
- The agent admits it doesn't know or cannot answer

IMPORTANT: Additional context is NOT a contradiction. For example:
- Expected: "The new default is X"
- Agent says: "The new default is X. Previously it was Y."
- This is CORRECT - the agent answered the question correctly and added helpful context.

You will receive:
- AGENT DESCRIPTION: What the agent does
- CONVERSATION: The chat history
- AGENT ANSWER: What the agent responded
- EXPECTED ANSWER: The correct answer to compare against

Output ONLY valid JSON with no additional text:
- If correct: {"correctness": true}
- If incorrect: {"correctness": false, "correctness_reason": "brief explanation of what was wrong or missing"}
"""

    return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))


# MCP Registration
async def register(mcp):
"""Register Out-of-Box Prompts"""
@@ -282,3 +327,14 @@ def rephrase_mcp() -> PromptMessage:
        based on conversation history before performing retrieval.
        """
        return get_prompt_with_override("optimizer_vs-rephrase")

    @mcp.prompt(name="optimizer_testbed-judge", title="Testbed Judge Prompt", tags=optimizer_tags)
    def testbed_judge_mcp() -> PromptMessage:
        """Prompt for testbed evaluation judge.

        Used by the testbed to evaluate whether the chatbot's answer matches the reference.
        Configurable to adjust evaluation strictness. The default prompt is lenient -
        it allows additional context in answers and only fails on contradictions or
        missing essential information.
        """
        return get_prompt_with_override("optimizer_testbed-judge")
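
For reference, the wiring between this new MCP prompt and the custom metric, as a short sketch under the same names used in `testbed.py` above: `get_prompt_with_override` returns the default text registered here unless an override has been saved from the Prompt Engineering page.

```python
# Sketch of the override flow (mirrors testbed_evaluate above): fetch the
# judge prompt by name, falling back to the default registered in this file,
# and hand its text to the configurable correctness metric.
from server.mcp.prompts.defaults import get_prompt_with_override
from server.api.utils.testbed_metrics import CustomCorrectnessMetric

judge_prompt_message = get_prompt_with_override("optimizer_testbed-judge")
judge_prompt = judge_prompt_message.content.text  # PromptMessage -> raw prompt text

metric = CustomCorrectnessMetric(
    name="correctness",
    system_prompt=judge_prompt,
    agent_description="A chatbot answering questions.",
)
```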