## Criteria evaluator

### Install libraries

In [18]:
!pip install -U langsmith langchain langchain-core langchain-text-splitters



In [19]:
from dotenv import load_dotenv
load_dotenv()

True

### Simple evaluation using criteria
The Criteria evaluator in LangChain is used to evaluate the results of LLM models according to a given criterion, e.g. correctness.

In [20]:
from langchain_classic.evaluation import load_evaluator
import json

# 1) We will use an evaluator with a reference (labeled_criteria)
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# 2) We compare the model response with the reference
result = evaluator.evaluate_strings(
    prediction="2 + 2 = 4",
    input="Policz 2 + 2",
    reference="4",
)

print(json.dumps(result, indent=4))

{
    "reasoning": "The criterion for this task is the correctness of the submission. The input asks to calculate 2 + 2. The submission provided is 2 + 2 = 4. The reference answer is 4. \n\nComparing the submission to the reference answer, we can see that the submission is correct. The calculation 2 + 2 does indeed equal 4, which matches the reference answer. \n\nTherefore, the submission meets the criterion of correctness. \n\nY",
    "value": "Y",
    "score": 1
}


### Import libraries and LangSmith Configuration

In [21]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI

# Enable LangSmith tracking (requires LangSmith account):
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "kurs-demo"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
#  os.environ["LANGSMITH_API_KEY"] = "<TWÓJ_KLUCZ>" #załączony w .env

### Generate LLM response

In [22]:
client = Client()
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

llm_test= ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1,max_tokens=256)
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1,max_tokens=256)

dataset_outputs = [
    {"result": llm_test.invoke(dataset_inputs[0])},
    {"result": llm_test.invoke(dataset_inputs[1])},
]
print(dataset_outputs)

[{'result': AIMessage(content='Humans typically have two legs because that is the natural and most efficient form of locomotion for our species. Having three legs would likely be more cumbersome and less practical for walking, running, and other activities. Additionally, the human body is not designed to support a third leg, both in terms of physical structure and balance. Evolution has shaped humans to have two legs as it provides the best balance of stability and mobility.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 83, 'prompt_tokens': 16, 'total_tokens': 99, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CaFZp3XB3rGm3GlNKUAPFrLCrjuZZ', 'service_tier': 'default', 'finish

### Custom evaluator

In [23]:
from langchain_classic.smith import RunEvalConfig, run_on_dataset
from langchain_classic.evaluation import Criteria
from langchain_classic.evaluation.schema import EvaluatorType
from langsmith.evaluation import EvaluationResult, run_evaluator

In [24]:
@run_evaluator
def custom_evaluator(run) -> EvaluationResult:
    """
    checks if output contains specific word
    :param run:
    :return: int
    """
    generated = str(run).lower()
    if 'human' in generated:
        score = 1
    else:
        score = 0
    return EvaluationResult(key="result", score=score)

eval_config = RunEvalConfig(
    custom_evaluators=[custom_evaluator],
    evaluators=[
        EvaluatorType.CRITERIA,
        EvaluatorType.QA,         #  directly rate the answer as "correct" or "incorrect" based on the reference answer
        EvaluatorType.CONTEXT_QA, #  use the given reference context to determine correctness
        EvaluatorType.COT_QA,     # chain of thought "reasoning"
        RunEvalConfig.Criteria(criteria=Criteria.INSENSITIVITY),
        RunEvalConfig.Criteria(criteria=Criteria.RELEVANCE),
        RunEvalConfig.Criteria(criteria=Criteria.HELPFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.MALICIOUSNESS),
        RunEvalConfig.Criteria(criteria=Criteria.HARMFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.COHERENCE),
        RunEvalConfig.Criteria(criteria=Criteria.CONCISENESS),
        RunEvalConfig.Criteria(criteria=Criteria.MISOGYNY),
        RunEvalConfig.Criteria(criteria=Criteria.CRIMINALITY),
        RunEvalConfig.Criteria(criteria=Criteria.CONTROVERSIALITY),
        RunEvalConfig.Criteria(  #  own defined criteria that relate to the problem occurring in the generated responses
            criteria={
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                "Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        )
    ],
)

### Launch and integration with LangSmith

In [25]:
import uuid

dataset_name = "existential questions run:" + uuid.uuid4().__str__() # parameter change required on each startup

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="evaluate LLM output",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

{'example_ids': ['82e38293-3be8-4d95-94c7-0cb87a3f03d1',
  '14558519-e1c2-4eac-8c74-bd7e55fc9f2e'],
 'count': 2}

In [26]:
eval_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def construct_chain():
    return llm_gen

scores = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=eval_config,
    project_name=dataset_name,
    verbose=True,
)
print(scores)

View the evaluation results for project 'existential questions run:51d1899d-073f-420c-adc9-56e3ea1f633c' at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/fc30de18-9120-4007-99b8-1a9f53e81c98/compare?selectedSessions=66e438f8-e8d9-4334-91c9-73077a91493a

View all tests for Dataset existential questions run:51d1899d-073f-420c-adc9-56e3ea1f633c at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/fc30de18-9120-4007-99b8-1a9f53e81c98
[------------------------------------------------->] 2/2


Unnamed: 0,feedback.helpfulness,feedback.correctness,feedback.Contextual Accuracy,feedback.COT Contextual Accuracy,feedback.insensitivity,feedback.relevance,feedback.maliciousness,feedback.harmfulness,feedback.coherence,feedback.conciseness,feedback.misogyny,feedback.criminality,feedback.controversiality,feedback.valuation,feedback.result,error,execution_time,run_id
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2
unique,,,,,,,,,,,,,,,,0.0,,2
top,,,,,,,,,,,,,,,,,,90b09df3-e3a0-44ca-84e0-5db8db370da5
freq,,,,,,,,,,,,,,,,,,1
mean,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,6.264307,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.257121,
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,5.375388,
25%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,5.819847,
50%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,6.264307,
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,6.708767,


{'project_name': 'existential questions run:51d1899d-073f-420c-adc9-56e3ea1f633c', 'results': {'14558519-e1c2-4eac-8c74-bd7e55fc9f2e': {'input': {'question': 'Why people are not flying?'}, 'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness". The submission should be helpful, insightful, and appropriate.\n\nLooking at the submission, the AI provides a comprehensive list of reasons why people might not be flying. The reasons are varied and cover a wide range of factors, from health and economic concerns to personal preferences and seasonal variations. This makes the response helpful as it provides a thorough answer to the question.\n\nThe response is also insightful. It doesn\'t just list reasons, but provides context and explanation for each one. This gives the user a deeper understanding of the factors that might influence people\'s decisions to fly.\n\nFinally, the response is appropriate. It directly addresses the