- Quickstart docs: https://docs.smith.langchain.com/evaluation/tutorials/evaluation
- LangSmith evaluation dashboard (experiment comparison): https://smith.langchain.com/o/f602a986-7b0f-5e2b-aa00-74708f7af432/datasets/cab1b44f-92cb-4aa0-8b0e-55891dd4271f/compare?selectedSessions=8d9f3ca8-1784-4a3d-95af-75252aac219f%2C45f67b78-78b8-40dd-8134-aee86e870bfc&baseline=8d9f3ca8-1784-4a3d-95af-75252aac219f

In [3]:
import os
from typing import List, Dict, Any
from dotenv import load_dotenv
from langsmith.evaluation import evaluate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
# Load environment variables
load_dotenv()

LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")
if not LANGCHAIN_API_KEY:
    # os.environ only accepts str values, so writing None back below would fail
    # with an opaque "str expected, not NoneType" TypeError. Fail early instead,
    # with a message that says what is actually missing.
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set; export it or add it to your .env file."
    )

# Tracing / project configuration for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = "evaluation_demo"


In [2]:

# LangSmith client used by the dataset-creation calls further down in this cell.
client = Client()

# (question, reference answer) pairs, grouped by topic.
examples = [
    # --- Geography ---
    (
        "Which country is Mount Kilimanjaro located in?",
        "Mount Kilimanjaro is located in Tanzania, a country in East Africa known for its diverse wildlife and stunning landscapes.",
    ),
    (
        "What is Earth's lowest point?",
        "Earth's lowest point is the Dead Sea, located at the border between Jordan and Israel, with a surface elevation of about 430 meters below sea level.",
    ),
    # --- Science ---
    (
        "What is the chemical symbol for water?",
        "The chemical symbol for water is H₂O, which represents two hydrogen atoms bonded to one oxygen atom.",
    ),
    (
        "Who developed the theory of relativity?",
        "The theory of relativity, which revolutionized our understanding of space, time, and gravity, was developed by Albert Einstein in the early 20th century.",
    ),
    # --- History ---
    (
        "Who was the first President of the United States?",
        "The first President of the United States was George Washington, who served from 1789 to 1797 and played a key role in leading the country during its formative years.",
    ),
    (
        "When did World War II end?",
        "World War II ended in 1945, marking the conclusion of one of the most significant and devastating conflicts in human history.",
    ),
    # --- Literature ---
    (
        "Who wrote 'Pride and Prejudice'?",
        "'Pride and Prejudice' was written by Jane Austen, a renowned English novelist known for her insightful commentary on society and human relationships.",
    ),
    (
        "What is the title of Shakespeare's play about the Prince of Denmark?",
        "The title of Shakespeare's play about the Prince of Denmark is 'Hamlet', a tragedy that explores themes of revenge, madness, and morality.",
    ),
    # --- Technology ---
    (
        "What does CPU stand for in computing?",
        "CPU stands for Central Processing Unit, which is the primary component of a computer responsible for executing instructions and performing calculations.",
    ),
    (
        "Who is considered the founder of Microsoft?",
        "Microsoft was founded by Bill Gates and Paul Allen in 1975, and it has since become one of the leading technology companies in the world.",
    ),
    # --- General knowledge ---
    (
        "What is the capital of Japan?",
        "The capital of Japan is Tokyo, a bustling metropolis that is the political, economic, and cultural center of the country.",
    ),
    (
        "Which planet is known as the Red Planet?",
        "Mars is known as the Red Planet due to its reddish appearance caused by iron oxide, or rust, on its surface.",
    ),
]

# Split the pairs into the two parallel lists of dicts that are uploaded as
# example inputs and reference outputs.
inputs = [dict(question=pair[0]) for pair in examples]
outputs = [dict(answer=pair[1]) for pair in examples]

# Programmatically create the dataset in LangSmith, or reuse it when an earlier
# run of this cell already created it: creating a second dataset with the same
# name is rejected by LangSmith, which made the cell fail on every re-run.
if client.has_dataset(dataset_name="Sample dataset"):
    dataset = client.read_dataset(dataset_name="Sample dataset")
else:
    dataset = client.create_dataset(
        dataset_name="Sample dataset", description="A sample dataset in LangSmith."
    )
    # Seed only a freshly created dataset so that re-runs do not duplicate examples.
    # NOTE(review): an existing-but-empty dataset is left untouched -- seed it manually.
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)


In [12]:
# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> str:
    """Answer one dataset example with the Groq-hosted chat model.

    Args:
        inputs: The example's input dict; the examples created in this
            notebook store the prompt under the ``"question"`` key.

    Returns:
        The text content of the model's reply.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` entry.
    """
    llm = ChatGroq(model="mixtral-8x7b-32768")
    # Chat models take a prompt string (or messages), not the raw example dict,
    # so pull the question out before invoking.
    res = llm.invoke(inputs["question"])

    return res.content


In [10]:
# Define output schema for the LLM judge
# The schema has a single boolean field; it is handed to
# JsonOutputParser(pydantic_object=Grade) inside accuracy().
class Grade(BaseModel):
    # Verdict of the judge: does the response agree with the reference answer?
    score: bool = Field(
        description="Boolean that indicates whether the response is accurate relative to the reference answer"
    )

def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    """LLM-as-judge evaluator: is the answer conceptually equal to the reference?

    Args:
        outputs: Output dict produced by the target function for one example.
        reference_outputs: Reference (ground-truth) output dict of that example.

    Returns:
        True when the judge model finds the answer conceptually matching the
        reference, False otherwise.

    Raises:
        Exception: Whatever the model call, the JSON parser, or the ``Grade``
            validation raises when the judge's reply does not fit the schema.
    """
    # Set up a parser + inject instructions into the prompt template.
    parser = JsonOutputParser(pydantic_object=Grade)
    model = ChatGroq(model="mixtral-8x7b-32768")
    # Plain (non f-) string: the placeholders must survive until the chain runs.
    # Interpolating the dicts up front put their literal braces into the template,
    # where they were then parsed as (unknown) template variables.
    prompt = PromptTemplate(
        template="""Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
        - False: No conceptual match and similarity
        - True: Most or full conceptual match and similarity
        - Key criteria: Concept should match, not exact wording.

        {format_instructions}

        Ground Truth answer: {reference_outputs}
        Student's Answer: {outputs}
        """,
        input_variables=["reference_outputs", "outputs"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    chain = prompt | model | parser

    result = chain.invoke(
        {"reference_outputs": reference_outputs, "outputs": outputs}
    )
    # The JSON parser yields a plain dict; validate it against the schema and
    # return the bare boolean the signature promises.
    return Grade(**result).score




# print(experiment_results)

In [None]:
# Run the experiment over the stored dataset; when it finishes, a link to the
# results in LangSmith is provided.
experiment_results = evaluate(
    target,
    data="Sample dataset",
    evaluators=[accuracy],  # further evaluators can be appended to this list
    max_concurrency=2,
    experiment_prefix="first-eval-in-langsmith",
)


## Import Libraries

In [13]:
import os
from typing import List, Dict, Any
from dotenv import load_dotenv
from langsmith.evaluation import evaluate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
# Load environment variables
load_dotenv()

LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")
if not LANGCHAIN_API_KEY:
    # os.environ only accepts str values, so writing None back below would fail
    # with an opaque "str expected, not NoneType" TypeError. Fail early instead,
    # with a message that says what is actually missing.
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set; export it or add it to your .env file."
    )

# Tracing / project configuration for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = "evaluation_demo"

## Create a dataset

In [None]:
from langsmith import Client

# Initialize LangSmith client
client = Client()

# Define dataset: these are your test cases
dataset_name = "QA Example Dataset"

# Define inputs and outputs based on your examples.
# The two lists are parallel: outputs[i] is the reference answer for inputs[i].
inputs = [
    {"question": "Which country is Mount Kilimanjaro located in?"},
    {"question": "What is the chemical symbol for water?"},
    {"question": "Who developed the theory of relativity?"},
    {"question": "Who was the first President of the United States?"},
    {"question": "When did World War II end?"},
    {"question": "Who wrote 'Pride and Prejudice'?"},
    {"question": "What is the title of Shakespeare's play about the Prince of Denmark?"},
    {"question": "Who is considered the founder of Microsoft?"},
]

outputs = [
    {"answer": "Mount Kilimanjaro is located in Tanzania, a country in East Africa known for its diverse wildlife and stunning landscapes."},
    {"answer": "The chemical symbol for water is H₂O, which represents two hydrogen atoms bonded to one oxygen atom."},
    {"answer": "The theory of relativity, which revolutionized our understanding of space, time, and gravity, was developed by Albert Einstein in the early 20th century."},
    {"answer": "The first President of the United States was George Washington, who served from 1789 to 1797 and played a key role in leading the country during its formative years."},
    {"answer": "World War II ended in 1945, marking the conclusion of one of the most significant and devastating conflicts in human history."},
    {"answer": "'Pride and Prejudice' was written by Jane Austen, a renowned English novelist known for her insightful commentary on society and human relationships."},
    {"answer": "The title of Shakespeare's play about the Prince of Denmark is 'Hamlet', a tragedy that explores themes of revenge, madness, and morality."},
    {"answer": "Microsoft was founded by Bill Gates and Paul Allen in 1975, and it has since become one of the leading technology companies in the world."},
]

# Reuse the dataset when an earlier run of this cell already created it:
# creating a second dataset with the same name is rejected by LangSmith, which
# made the cell fail on every re-run.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    # Add examples only to a freshly created dataset so re-runs do not duplicate them.
    # NOTE(review): an existing-but-empty dataset is left untouched -- seed it manually.
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=dataset.id,
    )


## Define Correctness Evaluator

In [14]:
from langchain_groq import ChatGroq
from langchain_core.prompts.prompt import PromptTemplate
from langsmith.evaluation import LangChainStringEvaluator

# Grading prompt. {query}, {answer} and {result} are the template's placeholders.
_PROMPT_TEMPLATE = (
    "You are an expert professor specialized in grading students' answers to questions.\n"
    "You are grading the following question:\n"
    "{query}\n"
    "Here is the real answer:\n"
    "{answer}\n"
    "You are grading the following predicted answer:\n"
    "{result}\n"
    "Respond with CORRECT or INCORRECT:\n"
    "Grade:\n"
)

PROMPT = PromptTemplate(
    template=_PROMPT_TEMPLATE,
    input_variables=["query", "answer", "result"],
)

# Model that acts as the grader.
eval_llm = ChatGroq(model="llama-3.3-70b-versatile")

# "qa" string evaluator, configured with the custom prompt and grader model above.
qa_evaluator = LangChainStringEvaluator(
    "qa",
    config={"llm": eval_llm, "prompt": PROMPT},
)



In [15]:
from langsmith.schemas import Run, Example

def evaluate_length(run: Run, example: Example) -> dict:
    """Score whether the prediction is shorter than twice the reference answer.

    Args:
        run: The run under evaluation; its ``outputs["output"]`` is the prediction.
        example: The dataset example; its ``outputs["answer"]`` is the reference.

    Returns:
        ``{"key": "length", "score": 1}`` when the prediction is strictly shorter
        than twice the reference, otherwise the same dict with score ``0``.
        A missing prediction or reference is treated as the empty string.
    """
    # `outputs` can be None (e.g. a run that produced no output), in which case
    # calling .get() on it would raise AttributeError -- fall back to an empty dict.
    prediction = (run.outputs or {}).get("output") or ""
    required = (example.outputs or {}).get("answer") or ""
    score = int(len(prediction) < 2 * len(required))
    return {"key": "length", "score": score}

## Hallucination Check Evaluator

In [26]:
from langchain.chat_models import init_chat_model
from langsmith.schemas import Run
from pydantic import BaseModel, Field

# Data model
# Passed to with_structured_output() when grader_llm is built, so the grader's
# reply is expected to carry exactly this one field.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # `...` as the default marks the field as required.
    is_grounded: bool = Field(..., description="True if the answer is grounded in the facts, False otherwise.")

# Grader model whose replies are returned as GradeHallucinations (JSON mode).
# Background on structured outputs:
# https://python.langchain.com/docs/how_to/structured_output/
grader_llm = init_chat_model(
    model="gemma2-9b-it",
    model_provider="groq",
    temperature=0,
).with_structured_output(GradeHallucinations, method="json_mode")

def _find_child_run(parent: Run, name: str) -> Run:
    """Return the first direct child run of ``parent`` called ``name``."""
    # child_runs may be None when the run was loaded without its children.
    for child in parent.child_runs or []:
        if child.name == name:
            return child
    raise ValueError(f"Run {parent.name!r} has no child run named {name!r}")


def no_hallucination(run: Run) -> bool:
    """Check if the answer is grounded in the documents.

    Looks up the ``"retrieve"`` step nested under the ``"qa_pipeline"`` child
    run, joins the ``page_content`` of its output documents, and asks the
    grader model whether ``run.outputs["answer"]`` is supported by them.

    Args:
        run: Root run of the traced application.
            NOTE(review): assumes a pipeline that traces ``qa_pipeline`` ->
            ``retrieve`` steps and returns an ``"answer"`` key -- confirm
            against the application being evaluated.

    Returns:
        True if there is no hallucination, False otherwise.

    Raises:
        ValueError: If the ``qa_pipeline`` or ``retrieve`` child run is missing
            (previously surfaced as a bare ``StopIteration``).
    """
    # Get documents and answer
    qa_pipeline_run = _find_child_run(run, "qa_pipeline")
    retrieve_run = _find_child_run(qa_pipeline_run, "retrieve")
    retrieved_content = "\n\n".join(
        doc["page_content"] for doc in retrieve_run.outputs["output"]
    )

    # Construct prompt.
    # The grader runs in JSON mode, which does not convey the schema to the
    # model by itself -- the prompt has to spell out the expected JSON object.
    instructions = (
        "You are a grader assessing whether an LLM generation is grounded in / "
        "supported by a set of retrieved facts. Respond with a JSON object "
        'containing a single boolean key "is_grounded": true if the answer is '
        "grounded in / supported by the set of facts, false otherwise."
    )

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": f"Set of facts:\n{retrieved_content}\n\nLLM generation: {run.outputs['answer']}"},
    ]

    grade = grader_llm.invoke(messages)
    return grade.is_grounded

## Run Evaluations

In [27]:
def target(inputs: dict) -> str:
    """Answer one dataset example with the Groq-hosted chat model.

    Args:
        inputs: The example's input dict; the examples created in this
            notebook store the prompt under the ``"question"`` key.

    Returns:
        The text content of the model's reply.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` entry.
    """
    llm = ChatGroq(model="mixtral-8x7b-32768")
    # Chat models take a prompt string (or messages), not the raw example dict,
    # so pull the question out before invoking.
    res = llm.invoke(inputs["question"])

    return res.content

In [28]:
def langsmith_app(inputs):
    """Send the example's question to the chat model and wrap the reply.

    Reads ``inputs["question"]`` and returns ``{"output": <reply text>}``.
    """
    question = inputs["question"]
    reply = ChatGroq(model="mixtral-8x7b-32768").invoke(question)
    return {"output": reply.content}

In [31]:
from langsmith import evaluate
dataset_name = "QA Example Dataset"

# Run the app over every example of the dataset and score each result with the
# listed evaluators. Pass experiment_prefix="..." to label the experiment.
experiment_results = evaluate(
    langsmith_app,
    evaluators=[evaluate_length, qa_evaluator],
    data=dataset_name,
)



View the evaluation results for experiment: 'cold-volcano-93' at:
https://smith.langchain.com/o/f602a986-7b0f-5e2b-aa00-74708f7af432/datasets/cab1b44f-92cb-4aa0-8b0e-55891dd4271f/compare?selectedSessions=92f7f5e6-6dd6-486e-b699-fecd92707f75




8it [01:28, 11.11s/it]


In [32]:
experiment_results

Unnamed: 0,inputs.question,outputs.output,error,reference.answer,feedback.length,feedback.correctness,execution_time,example_id,id
0,Who developed the theory of relativity?,Albert Einstein developed the theory of relati...,,"The theory of relativity, which revolutionized...",1,1,45.40947,84e6e56b-ecf6-45ef-b84d-c511e2d5f01d,922b10d5-048b-4c6e-80c2-b4b642752448
1,Which country is Mount Kilimanjaro located in?,"Mount Kilimanjaro is located in Tanzania, a co...",,"Mount Kilimanjaro is located in Tanzania, a co...",0,1,45.435957,9b554245-b960-4e2b-ba36-2d62a8c6d9ca,0283f4fa-4702-478a-abe2-c66d9d79347b
2,What is the chemical symbol for water?,"The chemical formula for water is H2O, not a s...",,"The chemical symbol for water is H₂O, which re...",0,1,45.393841,e7a92e73-b60a-4df8-9d12-73201802089b,8f0fba22-fe0f-41f5-9b27-0f4a9520d852
3,Who is considered the founder of Microsoft?,"Bill Gates, along with Paul Allen, is consider...",,Microsoft was founded by Bill Gates and Paul A...,0,1,45.514016,2c69f474-66c2-45b6-90d8-9136fe396cf9,f5100ac1-7f72-4412-a8a0-960d3a24e9d2
4,When did World War II end?,"World War II ended officially on September 2, ...",,"World War II ended in 1945, marking the conclu...",0,1,45.51249,16e151ca-0386-45be-b4b6-05998ab1d91e,0a9a1490-de52-4bfd-92c1-d57cb0c72e9b
5,Who wrote 'Pride and Prejudice'?,"The author of ""Pride and Prejudice"" is Jane Au...",,'Pride and Prejudice' was written by Jane Aust...,0,1,45.559891,105ac9cc-92f8-47e4-9218-688aa289871b,0218ae01-0d77-45d2-bc89-024af5c72062
6,Who was the first President of the United States?,George Washington was the first President of t...,,The first President of the United States was G...,0,1,45.827198,25e29272-fe72-4c44-9604-4ae4a05064f7,9f0b1b1d-d597-4ebd-b1cc-c6f0942aa67b
7,What is the title of Shakespeare's play about ...,The title of the Shakespeare play you're refer...,,The title of Shakespeare's play about the Prin...,0,1,45.811562,bd818574-e76c-4860-9e7a-4a52854a7569,19315416-0734-419b-9e19-f59dc2e244bd
