In [1]:
# Set up the Lynxius API key and endpoint before any client is created
import os
import sys
from getpass import getpass

# Put the parent directory on the module search path
sys.path.append("../")

# Reuse a key already present in the environment; otherwise prompt for one
# (getpass is used so the key is typed at a password prompt rather than stored in the notebook)
lynxius_api_key = os.getenv("LYNXIUS_API_KEY") or getpass("🔑 Enter your Lynxius API key: ")

# Export both settings as environment variables
# (presumably read by the Lynxius client -- confirm against the Lynxius docs).
# The base URL below is a placeholder: replace it with the endpoint you were given.
os.environ["LYNXIUS_API_KEY"] = lynxius_api_key
os.environ["LYNXIUS_BASE_URL"] = "https://REQUEST-THE-ENDOINT-TO-GET-ACCESS"

🔑 Enter your Lynxius API key:  ········


In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
from lynxius.client import LynxiusClient

# ID of the dataset to evaluate against
# (the ID of each of your datasets is shown in the Lynxius online platform)
dataset_id = "6e83cec5-d8d3-4237-af9e-8d4b7c71a2ce"

# Create the client and download the dataset details
client = LynxiusClient()
dataset_details = client.get_dataset_details(dataset_id=dataset_id)

In [4]:
# Our sample LLM application
from datasets_utils import chatdoctor_v1

# Importing the evaluators
from lynxius.evals.bert_score import BertScore
from lynxius.evals.answer_correctness import AnswerCorrectness
from lynxius.evals.semantic_similarity import SemanticSimilarity
from lynxius.evals.custom_eval import CustomEval
from lynxius.evals.context_precision import ContextPrecision

# ContextChunk represents a document retrieved from your RAG system
from lynxius.rag.types import ContextChunk

In [5]:
# Sample RAG context shared by every trace in this notebook.
# In a real application these chunks would come from your own RAG database,
# so how they are retrieved depends on the store you use.
context = [
    ContextChunk(
        document="Avoid close contact with people who are sick. When you are sick, keep your distance from others to protect them from getting sick, too.",
        relevance=0.75,
    ),
    ContextChunk(
        document="If possible, stay home from work, school, and errands when you’re sick. You can go back to your normal activities when, for at least 24 hours, both are true:",
        relevance=0.31,
    ),
]

In [6]:
# Lynxius allows you to use your own evaluator templates.
# Let's define and use one!

# When using a custom template, the only thing you need to ensure is that
# the final verdict is printed at the very bottom of the response, with no other characters.
#
# The template contains three placeholders: {query}, {reference} and {output}.
# NOTE(review): these are presumably filled by name from the `values` dict given
# to `CustomEval.add_trace` -- confirm against the Lynxius docs.
custom_eval_template = """
You are given a question, a reference answer and a candidate answer concerning a clinical matter.
You must determine if the candidate answer covers exactly the same content as the reference answer.
If the candidate answer contains additional information, or fails to mention something that is present
in the reference answer, your verdict should be 'incorrect'. Otherwise, your verdict should be 'correct'.
Provide a short explanation about how you arrived to your verdict. The verdict must be printed at the
very bottom of your response, on a new line, and it must not contain any extra characters.
Here is the data:
***********
Query: {query}
***********
Reference answer: {reference}
***********
Candidate answer: {output}
"""

In [7]:
# Build the evaluators; all five are created with the same label
eval_label = "PR #111"

bert_score = BertScore(eval_label, level="word", presence_threshold=0.65)
answer_correctness = AnswerCorrectness(eval_label)
semantic_similarity = SemanticSimilarity(eval_label)
custom_eval = CustomEval(eval_label, prompt_template=custom_eval_template)
context_precision = ContextPrecision(eval_label)

for entry in dataset_details.entries:
    # Ask the sample LLM application to answer this dataset query
    actual_output = chatdoctor_v1(entry.query)

    # Record one trace per evaluator, each with the fields it is given below
    bert_score.add_trace(
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    answer_correctness.add_trace(
        query=entry.query,
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    semantic_similarity.add_trace(
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    # The keys of `values` match the placeholders used in custom_eval_template
    custom_eval.add_trace(
        values={
            "query": entry.query,
            "reference": entry.reference,
            "output": actual_output,
        },
        context=context,
    )
    # No `output` is passed to this evaluator
    context_precision.add_trace(
        query=entry.query,
        reference=entry.reference,
        context=context,
    )

# Submit the evaluators in the same order as they were defined
for evaluator in (bert_score, answer_correctness, semantic_similarity, custom_eval):
    client.evaluate(evaluator)

# Kept as the cell's final bare expression so its return value is displayed
client.evaluate(context_precision)

'd2c63a48-8f46-4b48-bc9e-d0ce2ef29f5e'