# Run Evaluations Locally

This notebook demonstrates how to perform evaluations locally on your machine and upload the results to the [Lynxius Platform](https://platform.lynxius.ai/auth/signup).

With the local evaluation setup, you will need to manage the API keys for the models used in testing. Evaluation tasks will use your own compute resources and are blocking tasks. For a fully managed solution with non-blocking tasks, see [Run Evaluations Remotely](./eval_remotely.ipynb).

In [1]:
# First, we have to set up the Lynxius API key
import os
import sys
from getpass import getpass
sys.path.append("../")

# Reuse the key from the environment when it is set and non-empty;
# otherwise prompt for it without echoing the input.
lynxius_api_key = os.getenv("LYNXIUS_API_KEY") or getpass("🔑 Enter your Lynxius API key: ")

# Export both settings as environment variables.
os.environ.update(
    LYNXIUS_API_KEY=lynxius_api_key,
    LYNXIUS_BASE_URL="https://platform.lynxius.ai",
)

In [2]:
# Makes it easier to iterate: with autoreload mode 2, IPython reloads
# modified modules before executing each cell, so edits to imported source
# files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# The local evaluations use OpenAI, so an OpenAI API key is required.
# Take it from the environment when set and non-empty; otherwise prompt for it.
openai_api_key = os.getenv("OPENAI_API_KEY") or getpass("🔑 Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
from lynxius.client import LynxiusClient

# ID of a dataset previously uploaded to the Lynxius Platform
DATASET_ID = "7eff0d38-50ee-4b5d-a30d-cf428288016c"

client = LynxiusClient()

# Download the details of that dataset
dataset_details = client.get_dataset_details(dataset_id=DATASET_ID)

In [5]:
# Our sample LLM application
from datasets_utils import chatdoctor_v1

# Importing the evaluators
from lynxius.evals.local.bert_score import BertScoreLocal
from lynxius.evals.local.answer_correctness import AnswerCorrectnessLocal
from lynxius.evals.local.semantic_similarity import SemanticSimilarityLocal
from lynxius.evals.local.custom_eval import CustomEvalLocal
from lynxius.evals.local.context_precision import ContextPrecisionLocal
from lynxius.evals.local.json_diff import JsonDiffLocal

# ContextChunk represents a document retrieved from your RAG system
from lynxius.rag.types import ContextChunk

In [6]:
# Sample RAG contexts shared by the evals below.
# How context documents are actually retrieved depends on the RAG database you use.
context = [
    ContextChunk(
        document="Avoid close contact with people who are sick. When you are sick, keep your distance from others to protect them from getting sick, too.",
        relevance=0.75,
    ),
    ContextChunk(
        document="If possible, stay home from work, school, and errands when you’re sick. You can go back to your normal activities when, for at least 24 hours, both are true:",
        relevance=0.31,
    ),
]

# Tags make these eval runs easier to locate on the Lynxius platform
tags = ["notebook", "experiment"]

In [7]:
# Lynxius allows you to use your own evaluator templates.
# Let's define and use one!

# When using a custom template, the only thing that you need to ensure is that
# the final verdict is printed at the very bottom of the response, with no other characters.
#
# The {query}, {reference} and {output} placeholders below match the keys of the
# `values` dict passed to `custom_eval.add_trace` later in this notebook.
custom_eval_template = """
You are given a question, a reference answer and a candidate answer concerning a clinical matter.
You must determine if the candidate answer covers exactly the same content as the reference answer.
If the candidate answer contains additional information, or fails to mention something that is present
in the reference answer, your verdict should be 'incorrect'. Otherwise, your verdict should be 'correct'.
Provide a short explanation about how you arrived to your verdict. The verdict must be printed at the
very bottom of your response, on a new line, and it must not contain any extra characters.
Here is the data:
***********
Query: {query}
***********
Reference answer: {reference}
***********
Candidate answer: {output}
"""

In [8]:
# Shared first argument of every evaluator created in this cell
eval_label = "PR #222"

# Instantiate one evaluator per metric
bert_score = BertScoreLocal(
    eval_label,
    level="word",
    presence_threshold=0.65,
    tags=tags,
)
answer_correctness = AnswerCorrectnessLocal(eval_label, tags=tags)
semantic_similarity = SemanticSimilarityLocal(eval_label, tags=tags)
custom_eval = CustomEvalLocal(
    eval_label,
    name="clinical_correctness",
    prompt_template=custom_eval_template,
    tags=tags,
)
context_precision = ContextPrecisionLocal(eval_label, tags=tags)

for entry in dataset_details.entries:
    # Ask the sample LLM application to answer the dataset query
    actual_output = chatdoctor_v1(entry.query)

    # Register one trace per evaluator for this dataset entry
    bert_score.add_trace(
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    answer_correctness.add_trace(
        query=entry.query,
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    semantic_similarity.add_trace(
        reference=entry.reference,
        output=actual_output,
        context=context,
    )
    custom_eval.add_trace(
        values={
            "query": entry.query,
            "reference": entry.reference,
            "output": actual_output,
        },
        context=context,
    )
    context_precision.add_trace(
        query=entry.query,
        reference=entry.reference,
        context=context,
    )

# Run the evals locally and store the results in the Lynxius platform.
# Each call is its own statement; the notebook displays the value returned by the last one.
client.evaluate(bert_score)
client.evaluate(answer_correctness)
client.evaluate(semantic_similarity)
client.evaluate(custom_eval)
client.evaluate(context_precision)

'74136cb0-16d5-4b02-b6b9-8928035c1f54'

In [9]:
# Set up the JsonDiff eval
json_diff = JsonDiffLocal("PR #222", tags=tags)

# JSON object passed as the reference
ref = {
    "prop1": True,
    "prop2": 0.85,
    "prop3": [
        {"prop4": 0.92, "prop5": 0.71},
        {"prop4": 0.22, "prop5": 1.0},
    ],
}

# JSON object passed as the output to compare against the reference
output = {
    "prop1": False,
    "prop2": 0.71,
    "prop3": [
        {"prop4": 0.89, "prop5": 0.55},
        {"prop4": 0.34, "prop5": 0.97},
    ],
}

# Weights are optional.
# If not provided, each field will have an equal contribution to the overall
# score of every nested object.
# The default weight is 1.0; "prop3" sets it explicitly to show that this is allowed.
weights = {"prop1": 0.5, "prop2": 0.5, "prop3": 1.0}

# Register the single trace, then run the eval
json_diff.add_trace(
    reference=ref,
    output=output,
    weights=weights,
    context=context,
)
client.evaluate(json_diff)

'db95ed95-e991-4ad6-8bd5-3ba328e0db8c'