## Criteria evaluator

### Install libraries

In [27]:
!pip install -U langsmith langchain langchain-core langchain-text-splitters



In [28]:
from dotenv import load_dotenv
load_dotenv()

True

### Simple evaluation using criteria
The Criteria evaluator in LangChain is used to evaluate the results of LLM models according to a given criterion, e.g. correctness.

In [29]:
from langchain_classic.evaluation import load_evaluator
import json

# 1) We will use an evaluator with a reference (labeled_criteria)
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# 2) We compare the model response with the reference
result = evaluator.evaluate_strings(
    prediction="2 + 2 = 4",
    input="Policz 2 + 2",
    reference="4",
)

print(json.dumps(result, indent=4))

{
    "reasoning": "The criterion for this task is the correctness of the submitted answer. The input asks to calculate 2 + 2. The submitted answer is 2 + 2 = 4. Comparing this with the reference answer, which is 4, it is clear that the submitted answer is correct. Therefore, the submission meets the criterion of correctness.\n\nY",
    "value": "Y",
    "score": 1
}


### Import libraries and LangSmith Configuration

In [30]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI

# Enable LangSmith tracking (requires LangSmith account):
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "kurs-demo"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
#  os.environ["LANGSMITH_API_KEY"] = "<TWÓJ_KLUCZ>" #załączony w .env

### Generate LLM response

In [31]:
client = Client()
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

llm_test= ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1,max_tokens=256)
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1,max_tokens=256)

dataset_outputs = [
    {"result": llm_test.invoke(dataset_inputs[0])},
    {"result": llm_test.invoke(dataset_inputs[1])},
]
print(dataset_outputs)

[{'result': AIMessage(content='Humans typically have two legs because that is the natural and most efficient way for our bodies to move and function. Having three legs would likely be more cumbersome and less practical for everyday activities. Additionally, evolution has shaped humans to have two legs for millions of years, so there has been no need for a third leg to develop.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 65, 'prompt_tokens': 16, 'total_tokens': 81, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CaH0dQgxpckvaPn6HTBTg7B7A7MP3', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--40250ed3-5068-4c99-aecb-e9d6f55e433d-0', usage_meta

### Custom evaluator

In [32]:
from langchain_classic.smith import RunEvalConfig, run_on_dataset
from langchain_classic.evaluation import Criteria, EvaluatorType
from langsmith.evaluation import EvaluationResult, run_evaluator

In [33]:
@run_evaluator
def custom_evaluator(run) -> EvaluationResult:
    """
    checks if output contains specific word
    :param run:
    :return: int
    """
    generated = str(run).lower()
    if 'human' in generated:
        score = 1
    else:
        score = 0
    return EvaluationResult(key="result", score=score)

eval_config = RunEvalConfig(
    custom_evaluators=[custom_evaluator],
    evaluators=[
        EvaluatorType.CRITERIA,
        EvaluatorType.QA,         #  directly rate the answer as "correct" or "incorrect" based on the reference answer
        EvaluatorType.CONTEXT_QA, #  use the given reference context to determine correctness
        EvaluatorType.COT_QA,     # chain of thought "reasoning"
        RunEvalConfig.Criteria(criteria=Criteria.INSENSITIVITY),
        RunEvalConfig.Criteria(criteria=Criteria.RELEVANCE),
        RunEvalConfig.Criteria(criteria=Criteria.HELPFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.MALICIOUSNESS),
        RunEvalConfig.Criteria(criteria=Criteria.HARMFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.COHERENCE),
        RunEvalConfig.Criteria(criteria=Criteria.CONCISENESS),
        RunEvalConfig.Criteria(criteria=Criteria.MISOGYNY),
        RunEvalConfig.Criteria(criteria=Criteria.CRIMINALITY),
        RunEvalConfig.Criteria(criteria=Criteria.CONTROVERSIALITY),
        RunEvalConfig.Criteria(  #  own defined criteria that relate to the problem occurring in the generated responses
            criteria={
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                "Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        )
    ],
)

### Launch and integration with LangSmith

In [34]:
import uuid

dataset_name = "existential questions run:" + uuid.uuid4().__str__() # parameter change required on each startup

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="evaluate LLM output",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

{'example_ids': ['fc75d912-cded-4df2-9800-b7319728a176',
  '56f93dbc-950a-4f2b-9f4a-beef0e3ad64b'],
 'count': 2}

In [35]:
eval_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def construct_chain():
    return llm_gen

scores = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=eval_config,
    project_name=dataset_name,
    verbose=True,
)
print(scores)

View the evaluation results for project 'existential questions run:58c1ea9e-277f-404c-b036-aba5d971ed6f' at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/4e8a19be-df66-4cd9-aa94-51aee4acac12/compare?selectedSessions=d7303f4a-462d-4b59-a16b-1b895c9dbdad

View all tests for Dataset existential questions run:58c1ea9e-277f-404c-b036-aba5d971ed6f at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/4e8a19be-df66-4cd9-aa94-51aee4acac12
[------------------------------------------------->] 2/2


Unnamed: 0,feedback.helpfulness,feedback.correctness,feedback.Contextual Accuracy,feedback.COT Contextual Accuracy,feedback.insensitivity,feedback.relevance,feedback.maliciousness,feedback.harmfulness,feedback.coherence,feedback.conciseness,feedback.misogyny,feedback.criminality,feedback.controversiality,feedback.valuation,feedback.result,error,execution_time,run_id
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2
unique,,,,,,,,,,,,,,,,0.0,,2
top,,,,,,,,,,,,,,,,,,38290470-5473-4b06-a1ea-ff82f41c927e
freq,,,,,,,,,,,,,,,,,,1
mean,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,,5.890828,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,,0.0947,
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,5.823865,
25%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,,5.857347,
50%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,,5.890828,
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,1.0,,5.924309,


{'project_name': 'existential questions run:58c1ea9e-277f-404c-b036-aba5d971ed6f', 'results': {'56f93dbc-950a-4f2b-9f4a-beef0e3ad64b': {'input': {'question': "Why people don't have 3 legs?"}, 'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness". The submission should be evaluated based on whether it is helpful, insightful, and appropriate.\n\nLooking at the AI\'s response, it provides a detailed explanation of why humans have two legs instead of three. It covers various aspects such as bipedalism, balance and coordination, evolutionary path, and energy efficiency. This shows that the AI\'s response is insightful as it provides a comprehensive understanding of the topic.\n\nThe response is also helpful as it directly answers the question asked by the user. It provides clear and understandable reasons that a person without a background in biology or evolution could understand.\n\nThe response is appropriate as it stays