# LangSmith Onboarding

Resources:

- Videos: [Getting Started with LangSmith](https://www.youtube.com/watch?v=Hab2CV_0hpQ)
- GitHub: [nhuang-lc/langsmith-onboarding](https://github.com/nhuang-lc/langsmith-onboarding)

In [1]:
from dotenv import load_dotenv

In [2]:
# Load variables from ../.env into the process environment; override=True is
# passed so values from the file take precedence over ones that are already set.
load_dotenv(override=True, dotenv_path="../.env")

True

In [3]:
import os
# Set LANGCHAIN_PROJECT for this process (presumably the LangSmith project that
# the runs below are logged under — confirm against the LangSmith docs).
os.environ["LANGCHAIN_PROJECT"] = "langsmith-onboarding"

In [4]:
from langsmith import utils
# Sanity check before running anything: the cell output should be True.
utils.tracing_is_enabled()

True

In [5]:
from langchain_openai import ChatOpenAI

# Chat model shared by the chains built in the following cells.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [6]:
def fake_db_retrieval() -> str:
    """Return the full text of the Polly facts file, standing in for a DB lookup.

    The path is relative to the current working directory. The file is decoded
    explicitly as UTF-8 so the result does not depend on the platform's default
    locale encoding.

    Returns:
        The entire contents of ``langsmith-onboarding/polly_facts.txt``.

    Raises:
        FileNotFoundError: If the facts file does not exist at that path.
    """
    with open('langsmith-onboarding/polly_facts.txt', 'r', encoding='utf-8') as file:
        return file.read()

In [7]:
from langchain_core.prompts import ChatPromptTemplate

# Two-message prompt: a system message carrying the persona and the {facts}
# placeholder, followed by the user's {question}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a parrot named Polly! Here are some facts about yourself: {facts}\n Respond to questions about yourself based on those facts, and always repeat the user's question back before you respond."),
    ("user", "{question}")
])

# Pipe the prompt into the model so one invoke() formats the prompt and calls the LLM.
chain = prompt | llm

question = "What sport are you the best at?"
# Both template variables are supplied here; the facts come from the fake retrieval helper.
chain.invoke({"question": question, "facts": fake_db_retrieval()})

AIMessage(content='What sport are you the best at? Polly likes playing soccer! But Polly is not very good at basketball because Polly does not have hands.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 29, 'prompt_tokens': 98, 'total_tokens': 127, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-a55849c7-3217-4761-a149-d104a7e0d39a-0', usage_metadata={'input_tokens': 98, 'output_tokens': 29, 'total_tokens': 127, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Tracing and Prompts

In [8]:
from langsmith import traceable

@traceable(run_type="retriever")
def fake_db_retrieval_step(question):
    """Attach the Polly facts to a question, standing in for a retrieval step.

    The function is wrapped with ``traceable(run_type="retriever")``. The facts
    file is decoded explicitly as UTF-8 so the result does not depend on the
    platform's default locale encoding.

    Args:
        question: The user's question; passed through unchanged.

    Returns:
        A dict with the original ``"question"`` and the file contents under
        ``"facts"`` — the two variables the Polly prompt template expects.

    Raises:
        FileNotFoundError: If the facts file does not exist at the relative path.
    """
    with open('langsmith-onboarding/polly_facts.txt', 'r', encoding='utf-8') as file:
        polly_facts = file.read()
    return {"question": question, "facts": polly_facts}

In [9]:
from langchain import hub
# Pull our saved prompt by name. Appending ":<version hash>" pins a specific
# version; without it the latest version is pulled, e.g.
#     prompt = hub.pull("polly-prompt-1")
prompt = hub.pull("polly-prompt-1:97e2301d")

In [10]:
# Compose the traced retrieval step, the pulled prompt, and the model into one chain.
chain = fake_db_retrieval_step | prompt | llm

question = "What do you like to eat?"
# The chain takes the raw question string; fake_db_retrieval_step turns it into
# the {"question", "facts"} dict that is passed on to the prompt.
chain.invoke(question)

AIMessage(content="Tu veux savoir ce que j'aime manger ? J'aime les biscuits pour animaux !", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 102, 'total_tokens': 119, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-3d0d2cf5-c128-445c-a311-62f159426793-0', usage_metadata={'input_tokens': 102, 'output_tokens': 17, 'total_tokens': 119, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Datasets and Experiments

In [11]:
from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from openai import OpenAI

# LangSmith client used below to create the dataset and run the evaluation.
client = Client()
# OpenAI client passed through LangSmith's wrap_openai (presumably so its calls
# are traced — confirm against the LangSmith docs).
openai_client = wrappers.wrap_openai(OpenAI())

### Create a Dataset: Optional

In [12]:
# Other ways to create and manage datasets are described at:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

# (question, reference answer) pairs that become the dataset examples
examples = [
    ("Which country is Mount Kilimanjaro located in?", "Mount Kilimanjaro is located in Tanzania."),
    ("What is Earth's lowest point?", "Earth's lowest point is The Dead Sea."),
]

# Split the pairs into the parallel inputs/outputs lists passed to create_examples
inputs = [{"question": pair[0]} for pair in examples]
outputs = [{"answer": pair[1]} for pair in examples]

# Create the (empty) dataset in LangSmith ...
dataset = client.create_dataset(
    dataset_name="Sample dataset",
    description="A sample dataset in LangSmith.",
)

# ... and attach the examples to it by id
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

### Define Target to Be Evaluated -- The Model

In [13]:
# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    """Answer one dataset question with gpt-4o-mini.

    Args:
        inputs: The inputs of one dataset example; must contain a
            ``"question"`` key.

    Returns:
        A dict with a single ``"response"`` key holding the model's answer with
        surrounding whitespace stripped. If the model returns no text content,
        the response is an empty string.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` key.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer the following question accurately"},
            {"role": "user", "content": inputs["question"]},
        ],
    )
    # message.content is optional in the API response; fall back to "" so that
    # .strip() cannot fail on None and the example is still scored.
    answer = response.choices[0].message.content or ""
    return {"response": answer.strip()}

### Define Evaluator

In [14]:
# System prompt for the LLM judge: it compares the student answer with the
# ground truth on concepts (not exact wording) and classifies it true or false.
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false: 
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
"""

# Structured-output schema for the LLM judge: a single boolean verdict.
class Grade(BaseModel):
    score: bool = Field(
        description="Boolean that indicates whether the response is accurate relative to the reference answer",
    )

# LLM-as-judge evaluator: grades the response against the reference output
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    """Ask gpt-4o-mini whether the response conceptually matches the reference.

    Args:
        outputs: The target's output; its ``"response"`` value is graded.
        reference_outputs: The dataset reference; its ``"answer"`` value is
            used as the ground truth.

    Returns:
        The boolean ``score`` parsed from the judge's ``Grade`` reply.
    """
    judge_messages = [
        {"role": "system", "content": instructions},
        {
            "role": "user",
            # Same text as the judge prompt has always used, including the
            # line break and indentation before "Student's Answer".
            "content": (
                f"Ground Truth answer: {reference_outputs['answer']}; \n"
                f"      Student's Answer: {outputs['response']}"
            ),
        },
    ]
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=judge_messages,
        response_format=Grade,
    )
    verdict = completion.choices[0].message.parsed
    return verdict.score

### Run and View Results

In [15]:
# Run `target` against the "Sample dataset" examples and score the outputs with
# the listed evaluators. max_concurrency = 2 is passed to limit parallel runs.
# After running the evaluation, a link will be provided to view the results in LangSmith
experiment_results = client.evaluate(
    target,
    data = "Sample dataset",
    evaluators = [
        accuracy,
        # can add multiple evaluators here
    ],
    experiment_prefix = "first-eval-in-langsmith",
    max_concurrency = 2,
)

View the evaluation results for experiment: 'first-eval-in-langsmith-750ae152' at:
https://smith.langchain.com/o/97513603-fff2-4730-b519-2b1aeeaae05d/datasets/9db3037f-1fd6-430d-9f06-cc9bb5f55e0a/compare?selectedSessions=d5dd803f-c8a3-4278-a725-9853285392ff




0it [00:00, ?it/s]

### Own Re-Implementation with Custom Dataset and Evaluation Logic

In [47]:
from langsmith import wrappers, Client
from pydantic import BaseModel, Field

# LangSmith client used by the dataset-upload and evaluation cells below.
client = Client()

In [48]:
## -- Dataset
import pandas as pd

# Read the custom QA dataset from disk
df = pd.read_csv('qa_pairs_dummy.csv')

# For every question, find the row with the highest answer_quality and keep
# the rows whose pair_id belongs to one of those best rows
idx = df.groupby("question_id")["answer_quality"].idxmax()
pair_ids = df.loc[idx, "pair_id"]
df_best = df[df["pair_id"].isin(pair_ids)]

# (question, answer) tuples, in the row order of df_best
qa_pairs = list(zip(df_best["question_text"], df_best["answer_text"]))

# Split the pairs into the parallel inputs/outputs lists passed to create_examples
inputs = [{"question": pair[0]} for pair in qa_pairs]
outputs = [{"answer": pair[1]} for pair in qa_pairs]

# Create the (empty) dataset in LangSmith ...
dataset = client.create_dataset(
    dataset_name="dummy-qa-pairs-programmatic",
    description="A programmatically uploaded dummy dataset.",
)

# ... and attach the examples to it by id
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

In [49]:
## -- Model
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

def fake_retrieval() -> str:
    """Return the full text of ``facts.txt``, standing in for a retrieval step.

    The path is relative to the current working directory. The file is decoded
    explicitly as UTF-8 so the result does not depend on the platform's default
    locale encoding.

    Returns:
        The entire contents of ``facts.txt``.

    Raises:
        FileNotFoundError: If ``facts.txt`` does not exist in the working directory.
    """
    with open('facts.txt', 'r', encoding='utf-8') as file:
        return file.read()

def target_model(inputs: dict) -> dict:
    """Answer one dataset question with a prompt | model chain built per call.

    Args:
        inputs: The inputs of one dataset example; must contain a
            ``"question"`` key.

    Returns:
        A dict with a single ``"response"`` key holding the text content of the
        model's reply.
    """
    chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    system_template = "You are an AI Assistant. You are asked questions which you answer to the best of you knowledge. You need to consider some facts: {facts}\n Respond the questions you are asked based on those facts, and always repeat the user's question back before you respond."
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", system_template),
        ("user", "{question}"),
    ])

    # The facts are re-read on every call and injected next to the question
    pipeline = qa_prompt | chat_model
    chain_input = {"question": inputs["question"], "facts": fake_retrieval()}
    reply = pipeline.invoke(chain_input)

    return {"response": reply.content}

In [55]:
## -- Evaluator
from openai import OpenAI

# Here, we use OpenAI as the evaluator, but we could use any other model,
# even a local model via Ollama.
# The OpenAI client is passed through LangSmith's wrap_openai before use.
openai_client = wrappers.wrap_openai(OpenAI())

# System prompt for the LLM judge: it compares the student answer with the
# ground truth on concepts (not exact wording) and classifies it true or false.
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false: 
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
"""

# Structured-output schema for the LLM judge: a single boolean verdict.
class BooleanGrade(BaseModel):
    score: bool = Field(
        description="Boolean that indicates whether the response is accurate relative to the reference answer",
    )

# LLM-as-judge evaluator: grades the response against the reference output.
# The OpenAI judge could be swapped for a custom LLM, even a local one (via Ollama).
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    """Ask gpt-4o-mini whether the response conceptually matches the reference.

    Args:
        outputs: The target's output; its ``"response"`` value is graded.
        reference_outputs: The dataset reference; its ``"answer"`` value is
            used as the ground truth.

    Returns:
        The boolean ``score`` parsed from the judge's ``BooleanGrade`` reply.
    """
    system_turn = {"role": "system", "content": instructions}
    user_turn = {
        "role": "user",
        # Same text as the judge prompt has always used, including the line
        # break and indentation before "Student's Answer".
        "content": (
            f"Ground Truth answer: {reference_outputs['answer']}; \n"
            f"            Student's Answer: {outputs['response']}"
        ),
    }
    graded = openai_client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[system_turn, user_turn],
        response_format=BooleanGrade,
    )
    return graded.choices[0].message.parsed.score

# Heuristic evaluator that works without an LLM judge
def answer_contains_question(outputs: dict, inputs: dict) -> bool:
    """Check whether the response repeats at least half of the question's words.

    Words are whitespace-separated tokens compared exactly (case and
    punctuation matter); duplicates count once.

    NOTE(review): the parameter names are kept as-is because LangSmith appears
    to map evaluator arguments by name — confirm before renaming or adding any.

    Args:
        inputs: The dataset example inputs; must contain a ``"question"`` key.
        outputs: The target's output; must contain a ``"response"`` key.

    Returns:
        True if the share of distinct question words that also occur in the
        response is at least 0.5. False otherwise, including when the question
        contains no words at all (previously a ZeroDivisionError).
    """
    threshold: float = 0.5
    question_words = set(inputs["question"].split())
    if not question_words:
        # Nothing to look for: score the degenerate example as a miss instead
        # of dividing by zero.
        return False
    response_words = set(outputs["response"].split())
    overlap = question_words & response_words
    return len(overlap) / len(question_words) >= threshold

In [56]:
## -- Run Evaluation

# Run `target_model` against the "dummy-qa-pairs-programmatic" dataset and score
# the outputs with both the LLM judge and the word-overlap heuristic.
# After running the evaluation, a link will be provided to view the results in LangSmith
experiment_results = client.evaluate(
    target_model,
    data = "dummy-qa-pairs-programmatic",
    evaluators = [
        accuracy,
        answer_contains_question,
        # can add multiple evaluators here
    ],
    experiment_prefix = "dummy-qa-pairs-programmatic-experiment",
    max_concurrency = 2,
)

View the evaluation results for experiment: 'dummy-qa-pairs-programmatic-experiment-f49ab11a' at:
https://smith.langchain.com/o/97513603-fff2-4730-b519-2b1aeeaae05d/datasets/d498e188-0bff-4d66-b9b6-d6ec922dd6af/compare?selectedSessions=98d5cfd3-6bc6-49dd-93ed-7fa081214fc2




0it [00:00, ?it/s]