https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/docs/langsmith/walkthrough.ipynb#scrollTo=904db9a5-f387-4a57-914c-c8af8d39e249

In [9]:
import os
from uuid import uuid4
from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI
from langchain.callbacks import get_openai_callback
from langsmith import Client
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults

unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_API_KEY"]=""
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<>.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

client = Client()

In [7]:
llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_deployment="gpt-4",
    temperature=0,
)

In [8]:
message = HumanMessage(
    content="Translate this sentence from English to French. I love programming."
)
llm([message])

with get_openai_callback() as cb:
    llm([message])
    print(
        f"Total Cost (USD): ${format(cb.total_cost, '.6f')}"
    )

Total Cost (USD): $0.000990


In [10]:
model0125 = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    deployment_name="gpt-4",
    model_version="0125-Preview",
)
with get_openai_callback() as cb:
    model0125([message])
    print(f"Total Cost (USD): ${format(cb.total_cost, '.6f')}")

Total Cost (USD): $0.000400


# Log runs to LangSmith

In [12]:
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")

llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_deployment="gpt-4",
    temperature=0,
)

tools = [
    DuckDuckGoSearchResults(
        name="duck_duck_go"
    ),  # General internet search using DuckDuckGo
]
llm_with_tools = llm.bind_tools(tools)

runnable_agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [13]:
inputs = [
    "What is LangChain?",
    "What's LangSmith?",
    "When was Llama-v2 released?",
    "What is the langsmith cookbook?",
    "When did langchain first announce the hub?",
]

results = agent_executor.batch([{"input": x} for x in inputs], return_exceptions=True)

In [14]:
results[:2]

[openai.RateLimitError("Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}"),
 openai.RateLimitError("Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}")]

# Evaluate Agent

In addition to logging runs, LangSmith also allows you to test and evaluate your LLM applications.

In this section, you will leverage LangSmith to create a benchmark dataset and run AI-assisted evaluators on an agent. You will do so in a few steps:

1. Create a dataset
2. Initialize a new agent to benchmark
3. Configure evaluators to grade an agent's output
4. Run the agent over the dataset and evaluate the results

1. Create a LangSmith dataset

In [15]:
outputs = [
    "LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.",
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "July 18, 2023",
    "The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.",
    "September 5, 2023",
]
dataset_name = f"agent-qa-{unique_id}"

dataset = client.create_dataset(
    dataset_name,
    description="An example dataset of questions over the LangSmith documentation.",
)

client.create_examples(
    inputs=[{"input": query} for query in inputs],
    outputs=[{"output": answer} for answer in outputs],
    dataset_id=dataset.id,
)

2. Initialize a new agent to benchmark

LangSmith lets you evaluate any LLM, chain, agent, or even a custom function. Conversational agents are stateful (they have memory); to ensure that this state isn't shared between dataset runs, we will pass in a chain_factory (aka a constructor) function to initialize for each call.

In [16]:
from langchain import hub
from langchain.agents import AgentExecutor, AgentType, initialize_agent, load_tools
from langchain_openai import ChatOpenAI


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def create_agent(prompt, llm_with_tools):
    runnable_agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIToolsAgentOutputParser()
    )
    return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)

3. Configure evaluation

Manually comparing the results of chains in the UI is effective, but it can be time consuming. It can be helpful to use automated metrics and AI-assisted feedback to evaluate your component's performance.

In [17]:
from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run


def check_not_idk(run: Run, example: Example):
    """Illustration of a custom evaluator."""
    agent_response = run.outputs["output"]
    if "don't know" in agent_response or "not sure" in agent_response:
        score = 0
    else:
        score = 1
    # You can access the dataset labels in example.outputs[key]
    # You can also access the model inputs in run.inputs[key]
    return EvaluationResult(
        key="not_uncertain",
        score=score,
    )

Batch Evaluators

Some metrics are aggregated over a full "test" without being assigned to an individual runs/examples. These could be as simple as common classification metrics like Precision, Recall, or AUC, or it could be another custom aggregate metric.

You can define any batch metric on a full test level by defining a function (or any callable) that accepts a list of Runs (system traces) and list of Examples (dataset records).

In [18]:
from typing import List


def max_pred_length(runs: List[Run], examples: List[Example]):
    predictions = [len(run.outputs["output"]) for run in runs]
    return EvaluationResult(key="max_pred_length", score=max(predictions))

Below, we will configure the evaluation with the custom evaluator from above, as well as some pre-implemented run evaluators that do the following:

* Compare results against ground truth labels.
* Measure semantic (dis)similarity using embedding distance
* Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria

In [19]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        check_not_idk,
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,
        # Measure the embedding distance between the output and the reference answer
        # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())
        EvaluatorType.EMBEDDING_DISTANCE,
        # Grade whether the output satisfies the stated criteria.
        # You can select a default one such as "helpfulness" or provide your own.
        RunEvalConfig.LabeledCriteria("helpfulness"),
        # The LabeledScoreString evaluator outputs a score on a scale from 1-10.
        # You can use default criteria or write our own rubric
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
            },
            normalize_by=10,
        ),
    ],
    batch_evaluators=[max_pred_length],
)

4. Run the agent and evaluators

Use the run_on_dataset) function to evaluate your model. This will:

1. Fetch example rows from the specified dataset.
2. Run your agent (or any custom function) on each example.
3. Apply evaluators to the resulting run traces and corresponding reference examples to generate automated feedback.

In [20]:
from langchain import hub

# We will test this version of the prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:798e7324")

In [22]:
# import functools

# from langchain.smith import arun_on_dataset, run_on_dataset

# chain_results = run_on_dataset(
#     dataset_name=dataset_name,
#     llm_or_chain_factory=functools.partial(
#         create_agent, prompt=prompt, llm_with_tools=llm_with_tools
#     ),
#     evaluation=evaluation_config,
#     verbose=True,
#     client=client,
#     project_name=f"tools-agent-test-5d466cbc-{unique_id}",
#     # Project metadata communicates the experiment parameters,
#     # Useful for reviewing the test results
#     project_metadata={
#         "env": "testing-notebook",
#         "model": "gpt-4",
#         "prompt": "5d466cbc",
#     },
# )

# # Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.
# # These are logged as warnings here and captured as errors in the tracing UI.