In [1]:
!which ipython

/Users/nori/Library/Caches/pypoetry/virtualenvs/langsmith-walkthrough-E7zp2F2A-py3.12/bin/ipython


In [2]:
!pip list | grep lang

langchain                 0.1.20
langchain-community       0.0.38
langchain-core            0.1.52
langchain-openai          0.1.7
langchain-text-splitters  0.0.2
langchainhub              0.1.20
langsmith                 0.1.84


### Choose Either of the following unique_id

In [34]:
from uuid import uuid4

unique_id = uuid4().hex[0:8]

In [19]:
unique_id = "be132a1e"

---

In [35]:
import os

os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

In [36]:
!env | grep LANGCHAIN_PROJECT

LANGCHAIN_PROJECT=Tracing Walkthrough - 6ddef5ef


In [6]:
from langsmith import Client

client = Client()

Failed to batch ingest runs: LangSmithConnectionError("Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. ConnectionError(ProtocolError('Connection aborted.', TimeoutError('The write operation timed out')))")
Failed to batch ingest runs: LangSmithConnectionError("Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. ConnectionError(ProtocolError('Connection aborted.', TimeoutError('The write operation timed out')))")
Failed to batch ingest runs: LangSmithConnectionError("Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. ConnectionError(ProtocolError('Connection aborted.', TimeoutError('The write operation timed out')))")


### Choose Either tool of DuckDuckGo or Tavily

In [7]:
from langchain_community.tools import DuckDuckGoSearchResults

tools = [DuckDuckGoSearchResults(name="duck_duck_go")]

In [9]:
from langchain_community.tools.tavily_search import TavilySearchResults

tools = [TavilySearchResults(max_results=1)]

---

In [8]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_openai import ChatOpenAI
#from langchain_community.chat_models import ChatOpenAI

# Fetches the latest version of this prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")

llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    temperature=0,
)

llm_with_tools = llm.bind_tools(tools)

runnable_agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [37]:
inputs = [
    "What is LangChain?",
    "What's LangSmith?",
    "When was Llama-v2 released?",
    "What is the langsmith cookbook?",
    "When did langchain first announce the hub?",
    "Llama-v2 がリリースされた年の日本で最もヒットした曲のプロデューサーの初ジョクするプロダクションはなんて会社か。"
]

results = agent_executor.batch([{"input": x} for x in inputs], return_exceptions=True)

In [38]:
results

[{'input': 'What is LangChain?',
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangChain". Could you please provide more context or clarify your question?'},
 {'input': "What's LangSmith?",
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangSmith". It could be a company, a product, or a person. Can you provide more context or details about what you are referring to?'},
 {'input': 'When was Llama-v2 released?',
  'output': 'Llama-v2 was released in 2023.'},
 {'input': 'What is the langsmith cookbook?',
  'output': 'I\'m sorry, but I couldn\'t find any information about the "Langsmith Cookbook". It\'s possible that it may not be a well-known cookbook or it may not exist. Could you provide more context or clarify your question?'},
 {'input': 'When did langchain first announce the hub?',
  'output': 'LangChain first announced the LangChain Hub a few days ago.'},
 {'input': 'Llama-v2 がリリースされた年の日本で最もヒットした曲のプロデューサーの初ジョクするプロダクションはなんて会社か。',
  'output

In [39]:
outputs = [
    "LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.",
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "July 18, 2023",
    "The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.",
    "September 5, 2023", 
    "??"
]

In [40]:
dataset_name = f"agent-qa-{unique_id}"

dataset = client.create_dataset(
    dataset_name,
    description="An example dataset of questions over the LangSmith documentation.",
)

client.create_examples(
    inputs=[{"input": query} for query in inputs],
    outputs=[{"output": answer} for answer in outputs],
    dataset_id=dataset.id,
)

In [41]:
from langchain import hub
from langchain.agents import AgentExecutor, AgentType, initialize_agent, load_tools
from langchain_openai import ChatOpenAI


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def create_agent(prompt, llm_with_tools):
    runnable_agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIToolsAgentOutputParser()
    )
    return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)

In [42]:
from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run


def check_not_idk(run: Run, example: Example):
    """Illustration of a custom evaluator."""
    agent_response = run.outputs["output"]
    if "don't know" in agent_response or "not sure" in agent_response or "I couldn't find any information" in agent_response:
        score = 0
    else:
        score = 1
    # You can access the dataset labels in example.outputs[key]
    # You can also access the model inputs in run.inputs[key]
    return EvaluationResult(
        key="not_uncertain",
        score=score,
    )

In [43]:
from typing import List


def max_pred_length(runs: List[Run], examples: List[Example]):
    predictions = [len(run.outputs["output"]) for run in runs]
    return EvaluationResult(key="max_pred_length", score=max(predictions))

In [44]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        check_not_idk,
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,
        # Measure the embedding distance between the output and the reference answer
        # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())
        EvaluatorType.EMBEDDING_DISTANCE,
        # Grade whether the output satisfies the stated criteria.
        # You can select a default one such as "helpfulness" or provide your own.
        RunEvalConfig.LabeledCriteria("helpfulness"),
        # The LabeledScoreString evaluator outputs a score on a scale from 1-10.
        # You can use default criteria or write our own rubric
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
            },
            normalize_by=10,
        ),
    ],
    batch_evaluators=[max_pred_length],
)

In [45]:
from langchain import hub

# We will test this version of the prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:798e7324")

In [49]:
import functools
from langchain.smith import arun_on_dataset, run_on_dataset

candidate_prompt = hub.pull("wfh/langsmith-agent-prompt:39f3bbd0")

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=candidate_prompt, llm_with_tools=llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"tools-agent-test-5d466cbc--{unique_id}",
    # Project metadata communicates the experiment parameters,
    # Useful for reviewing the test results
    project_metadata={
        "env": "testing-notebook",
        "model": "gpt-3.5-turbo",
        "prompt": "5d466cbc",
    },
)

# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.
# These are logged as warnings here and captured as errors in the tracing UI.

View the evaluation results for project 'tools-agent-test-5d466cbc--6ddef5ef' at:
https://smith.langchain.com/o/7f14323a-b476-561c-87d3-67707c1e5011/datasets/f8e79a54-977a-4cea-83e2-fd524704f47d/compare?selectedSessions=88098e9a-0d67-45cb-96e9-d00c0de0207f

View all tests for Dataset agent-qa-6ddef5ef at:
https://smith.langchain.com/o/7f14323a-b476-561c-87d3-67707c1e5011/datasets/f8e79a54-977a-4cea-83e2-fd524704f47d
[------------------------------------------------->] 6/6

Unnamed: 0,feedback.not_uncertain,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,error,execution_time,run_id
count,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6
unique,,,,,,0.0,,6
top,,,,,,,,cdeb2d3f-e244-4247-b747-6839ade7e1f7
freq,,,,,,,,1
mean,1.0,0.833333,0.146024,0.833333,0.716667,,6.49359,
std,0.0,0.408248,0.110686,0.408248,0.160208,,3.929096,
min,1.0,0.0,0.043855,0.0,0.5,,3.44836,
25%,1.0,1.0,0.049499,1.0,0.7,,4.222523,
50%,1.0,1.0,0.123061,1.0,0.7,,5.370922,
75%,1.0,1.0,0.2417,1.0,0.7,,6.427634,


In [47]:
chain_results.to_dataframe()

Unnamed: 0,inputs.input,outputs.input,outputs.output,reference.output,feedback.not_uncertain,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,error,execution_time,run_id
68d54632-a74c-4f37-ba05-4428db8eb5f6,What is LangChain?,What is LangChain?,LangChain is a decentralized blockchain platfo...,LangChain is an open-source framework for buil...,1,0,0.092569,0,0.1,,2.38143,c82aa00d-3981-42fe-a56f-5b13f796ae2f
2dc9c8d3-e640-4091-ac4b-0fada4bc39fb,What's LangSmith?,What's LangSmith?,LangSmith is a platform designed to streamline...,"LangSmith is a unified platform for debugging,...",1,1,0.061673,1,0.7,,4.788338,311812b0-5c20-44b3-9d6f-a23d26888a2b
0d0e2279-6b86-46c2-8e3c-ad1ed70aa29d,When was Llama-v2 released?,When was Llama-v2 released?,Llama-v2 was released in July 2023.,"July 18, 2023",1,1,0.163679,1,1.0,,4.87381,48c7ba14-73ff-4b4b-b2c9-de8b8110a905
bb66bf77-1b7c-47ce-8149-30b410a08d6d,What is the langsmith cookbook?,What is the langsmith cookbook?,The LangSmith Cookbook is a repository that se...,The langsmith cookbook is a github repository ...,1,1,0.054625,1,0.9,,4.970316,12d97884-125f-4f02-bd4f-64f7c7dfdfb3
77d82b84-7ca2-444c-92a0-a9890dc90f6d,When did langchain first announce the hub?,When did langchain first announce the hub?,LangChain first announced the LangChain Hub on...,"September 5, 2023",1,0,0.251983,0,0.3,,3.755413,a3352a6a-8ac6-4aef-b7dc-4db48c28441a
b21436ad-64f2-4ab5-b71d-7878b86ff8b2,Llama-v2 がリリースされた年の日本で最もヒットした曲のプロデューサーの初ジョクするプ...,Llama-v2 がリリースされた年の日本で最もヒットした曲のプロデューサーの初ジョクするプ...,Llama-v2のリリース年に日本で最もヒットした曲のプロデューサーの初ジョクするプロダクシ...,??,1,0,0.254821,1,0.5,,12.663458,29cf5c41-9f66-42f6-bbbd-2c4d80018075


In [51]:
runs = client.list_runs(project_name=chain_results["project_name"], execution_order=1)

In [52]:
runs

<generator object Client.list_runs at 0x112f9d3a0>

In [54]:
# The resulting tests are stored in a project.  You can programmatically
# access important metadata from the test, such as the dataset version it was run on
# or your application's revision ID.
client.read_project(project_name=chain_results["project_name"]).metadata

{'env': 'testing-notebook',
 'git': {'tags': None,
  'dirty': True,
  'branch': 'main',
  'commit': '384ad1f0de3ea68482747bdecba24a39bef0ab1e',
  'repo_name': 'llm',
  'remote_url': 'git@github.com:norisuke3/llm.git',
  'author_name': 'Noriaki Hamamoto',
  'commit_time': '1719940864',
  'author_email': 'norisuke3@gmail.com'},
 'model': 'gpt-3.5-turbo',
 'prompt': '5d466cbc',
 'revision_id': '384ad1f',
 'dataset_version': '2024-07-08T23:37:35.099693+00:00'}

In [55]:
# After some time, the test metrics will be populated as well.
client.read_project(project_name=chain_results["project_name"]).feedback_stats

{'correctness': {'n': 5, 'avg': 0.8, 'values': {'CORRECT': 4, 'INCORRECT': 1}},
 'embedding_cosine_distance': {'n': 5, 'avg': 0.1236, 'values': {}},
 'helpfulness': {'n': 5, 'avg': 0.8, 'values': {'N': 1, 'Y': 4}},
 'not_uncertain': {'n': 5, 'avg': 1.0, 'values': {}},
 'score_string:accuracy': {'n': 5, 'avg': 0.72, 'values': {}}}