## String and comparison evaluation

In [8]:
! pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m10.9 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [3]:
### LangSmith (tracing) — opcjonalnie
# włącz śledzenie (jeśli masz konto)
# Opcjonalnie (wymaga konta):
# os.environ["LANGSMITH_TRACING"] = "true"
# os.environ["LANGSMITH_API_KEY"] = "<TWÓJ_KLUCZ>"
# os.environ["LANGSMITH_PROJECT"] = "kurs-demo"
print("LangSmith: ustaw zmienne środowiskowe, aby włączyć tracing.")

LangSmith: ustaw zmienne środowiskowe, aby włączyć tracing.


### Embedding Distance Evaluator

In [6]:
from langchain.evaluation import load_evaluator
from dotenv import load_dotenv

load_dotenv()

evaluator = load_evaluator("embedding_distance", embeddings_model="openai")

result = evaluator.evaluate_strings(
    prediction="Stolica Polski to Warszawa",
    reference="Warszawa jest stolicą Polski"
)

print(result)


{'score': 0.055613485077632974}


### String Comparison Evaluator

In [9]:
evaluator = load_evaluator("string_distance", metric="bleu")

result = evaluator.evaluate_strings(
    prediction="Warsaw is the capital of Poland",
    reference="The capital of Poland is Warsaw"
)

print(result)


{'score': 0.28903225806451616}


In [10]:
from langchain.evaluation import load_evaluator

# BLEU evaluator
bleu_eval = load_evaluator("string_distance", metric="bleu")

result_bleu = bleu_eval.evaluate_strings(
    prediction="Warsaw is the capital of Poland",
    reference="The capital of Poland is Warsaw"
)
print("BLEU:", result_bleu)

# ROUGE evaluator
rouge_eval = load_evaluator("string_distance", metric="rouge")

result_rouge = rouge_eval.evaluate_strings(
    prediction="Warsaw is capital",
    reference="Warsaw is the capital of Poland"
)
print("ROUGE:", result_rouge)

# METEOR evaluator
meteor_eval = load_evaluator("string_distance", metric="meteor")

result_meteor = meteor_eval.evaluate_strings(
    prediction="The dog runs quickly",
    reference="The dog is running fast"
)
print("METEOR:", result_meteor)


BLEU: {'score': 0.28903225806451616}
ROUGE: {'score': 0.11385199240986721}
METEOR: {'score': 0.30186335403726705}


### A/B Testing

In [None]:
from langchain.evaluation import PairwiseStringEvaluator

evaluator = PairwiseStringEvaluator()

result = evaluator.evaluate_string_pairs(
    prediction="Warsaw is the capital of Poland",
    prediction_b="The capital city of Poland is Warsaw",
    reference="Warsaw is Poland's capital"
)

print(result)  # np. {'winner': 'prediction_b', 'score': 0.9}


### Evaluate LLM by LLM

In [12]:
### Evaluate LLM by LLM
# before start fill env variables .env file:
# LANGCHAIN_API_KEY="put_here_your_langchain(langsmith)_api_token"
# OPENAI_API_KEY="put_here_your_openai_token"
# HUGGINGFACE_API_TOKEN="put_here_your_huggingface_token"
# to use OpenAI API you need to add billing details https://platform.openai.com/settings/organization/billing/overview
# for langchain token remember to add read permissions associated with token
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.evaluation import load_evaluator

#load dotenv (API key from .env)
load_dotenv()

True

In [21]:
llm = ChatOpenAI(model_name="gpt-4o")
template = """
You are base of knowledge about star wars. Respond to question below with only name without any additional text.
{input}
"""
prompt_template = PromptTemplate.from_template(template=template)
chain = prompt_template | llm
prediction = chain.invoke({"input": "What is the capital of star wars Sith Empire?"})

evaluator = load_evaluator("labeled_score_string", llm=ChatOpenAI(model="gpt-4o"))
eval_result = evaluator.evaluate_strings(
    prediction=prediction,
    reference="Dromund Kaas",
    input="What is the capital of star wars Sith Empire?",
)
print(eval_result)

{'reasoning': 'The AI assistant has responded with "Dromund Kaas" as the capital of the Sith Empire in the Star Wars universe. This response is correct and relevant. "Dromund Kaas" is indeed known as the capital of the Sith Empire, especially during periods depicted in the expanded universe materials such as the "Star Wars: The Old Republic" video game and associated lore. The response is also concise and directly answers the user\'s question without unnecessary information, which can be seen as a lack of depth but is typically acceptable for straightforward factual queries. The assistant\'s response does not provide additional context or background, which limits depth but maintains correctness and relevance.\n\nRating: [[9]]', 'score': 9}


In [17]:
model = ChatOpenAI(temperature=0)
template = """You are an expert in grading answers.
You are grading the following question:
{query}
Here is the correct expected answer:
{answer}
You are grading the following predicted answer:
{result}
What grade do you give from 0 to 5, where 0 is the lowest for low similarity and 5 is for the high similarity?
"""

prompt = PromptTemplate(
    input_variables=["query", "answer", "result"], template=template
)

In [18]:
context_examples = [
    {
        "question": "Why people don't brief underwater?",
        "context": "Because people don't have gills",
    },
    {
        "question": "Why the sky is blue?",
        "context": "Sky isn't blue. Its just optical effect related to sun rays coming to eye through atmosphere and interpretation in our mind.",
    },
    {
        "question": "What is in my pocket?",
        "context": "",
    },
]
prompt_qa = "Answer the question based on the  context\nContext:{context}\nQuestion:{question}\nAnswer:"
template = PromptTemplate(input_variables=["context", "question"], template=prompt_qa)
qa_chain = LLMChain(llm=model, prompt=template)
predictions = qa_chain.apply(context_examples)
predictions

[{'text': "Because people don't have gills, they are not able to breathe underwater."},
 {'text': 'The sky appears blue due to an optical effect caused by sun rays passing through the atmosphere and being interpreted by our eyes and mind.'},
 {'text': "I'm sorry, I cannot answer that question as I do not have the ability to see or know what is in your pocket."}]

In [16]:
from langchain.evaluation.qa import ContextQAEvalChain

eval_chain = ContextQAEvalChain.from_llm(model)
graded_outputs = eval_chain.evaluate(
    context_examples, predictions, question_key="question", prediction_key="text"
)
print(graded_outputs)

[{'text': ' CORRECT'}, {'text': ' CORRECT'}, {'text': ' CORRECT'}]
