In [1]:
import os


os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGSMITH_API_KEY")
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")
os.environ["LANGSMITH_TRACING"]="true" #so that langsmith can trace through the dataset and things happening

Creating a reference dataset

In [2]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "Chatbot Evaluation"
dataset = client.create_dataset(dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
            "inputs": {"question": "What is LangChain?"},
            "outputs": {"answer": "A framework for building LLM applications"},
        },
        {
            "inputs": {"question": "What is LangSmith?"},
            "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        },
        {
            "inputs": {"question": "What is OpenAI?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        },
        {
            "inputs": {"question": "What is Google?"},
            "outputs": {"answer": "A technology company known for search"},
        },
        {
            "inputs": {"question": "What is Mistral?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        }
    ]
)

{'example_ids': ['54800205-e738-4505-b582-6dddb7670dd4',
  '8fcc5170-4ca5-4f4c-b97b-ccd886fe82ac',
  '09787139-a515-4c0f-9b06-e93cdd157cd2',
  '5018fdfc-50a2-41b0-ae87-4eec7871a0ee',
  '51f62c1d-4b6e-4fd9-96b3-550f5e11aee6'],
 'count': 5,
 'as_of': '2026-02-09T14:18:46.832669002Z'}

Defining metrics ( LLM as JUDGE )

In [3]:
from groq import Groq
from openai import OpenAI
from langsmith import wrappers
 
llm_client = wrappers.wrap_openai(
    OpenAI(
        api_key=os.environ["GROQ_API_KEY"],
        base_url="https://api.groq.com/openai/v1"
    )
)

eval_instructions = "You are an expert professor specialized in grading students' answers to questions.Be 100 percent sure with the decisions"

def correctness(inputs:dict,outputs:dict, reference_outputs:dict)->bool:
      user_content = f"""You are grading the following question:
    {inputs['question']}
    Here is the real answer:
    {reference_outputs['answer']}
    You are grading the following predicted answer:
    {outputs.get('response', '')}

    Just Respond with CORRECT or INCORRECT and don't reply with any other details:
    Grade:
    """
      response=llm_client.chat.completions.create(
            model="openai/gpt-oss-20b",
            temperature=0,
            messages=[
                  {"role":"system","content":eval_instructions},
                  {"role":"user","content":user_content}
            ]
      ).choices[0].message.content

      return response.strip().upper().startswith("CORRECT")


In [4]:
## Concisions- checks whether the actual output is less than 2x the length of the expected result.

def concision(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    return int(len(outputs.get("response", "")) < 2 * len(reference_outputs["answer"]))

Run Evaluation

In [5]:
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."
def my_app(question: str, model: str = "openai/gpt-oss-20b", instructions: str = default_instructions) -> str:
    return llm_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    ).choices[0].message.content

In [6]:
### Call my_app for every datapoints
def ls_target(inputs: str) -> dict:
    return {"response": my_app(inputs["question"])}

In [8]:
experiment_results = client.evaluate(
    ls_target,          # your system under test
    data=dataset_name,  # LangSmith dataset
    evaluators=[correctness, concision],
    experiment_prefix="openai/gpt-oss-120b",
)


View the evaluation results for experiment: 'openai/gpt-oss-120b-18aa54a5' at:
https://smith.langchain.com/o/a6f4c2f1-2bbf-45fd-af95-0534b24925de/datasets/e6874c3e-b755-4886-a84a-1c9d3a15c532/compare?selectedSessions=a3977d9a-26e5-4cea-b5d8-b0fcc592b82d




5it [00:12,  2.43s/it]
