## Criteria evaluator

In [23]:
! pip install langchain



In [24]:
from dotenv import load_dotenv
load_dotenv()

True

### Prosta ewaluacja za pomocą kryteriów
Criteria evaluator w LangChain służy do oceniania wyników modeli LLM według zadanego kryterium, np. correctness (poprawność).

In [25]:
from langchain_classic.evaluation import load_evaluator
import json

# 1) Użyjemy ewaluatora z referencją (labeled_criteria)
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# 2) Porównujemy odpowiedź modelu z referencją
result = evaluator.evaluate_strings(
    prediction="2 + 2 = 4",
    input="Policz 2 + 2",
    reference="4",
)

print(json.dumps(result, indent=4))

{
    "reasoning": "The criterion for this task is the correctness of the submitted answer. The input asks to calculate 2 + 2. The submitted answer is 2 + 2 = 4. The reference answer is 4. The submitted answer matches the reference answer, which means the submission is correct, accurate, and factual. Therefore, the submission meets the criterion.\n\nY",
    "value": "Y",
    "score": 1
}


### Import bibliotek i konfiguracja LangSmith

In [26]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()

# Włącz śledzenie LangSmith (wymaga konta LangSmith):
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "kurs-demo"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGSMITH_API_KEY"] = "<TWÓJ_KLUCZ>" #załączony w .env

### Wygenerowanie odpowiedzi LLM

In [27]:
client = Client()
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

llm_test= ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1,max_tokens=256)
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1,max_tokens=256)

dataset_outputs = [
    {"result": llm_test.invoke(dataset_inputs[0])},
    {"result": llm_test.invoke(dataset_inputs[1])},
]
print(dataset_outputs)

[{'result': AIMessage(content='People do not have three legs because humans are typically born with two legs as part of their natural anatomy. The human body has evolved over millions of years to have two legs for walking and running efficiently. Having a third leg would likely be unnecessary and could potentially hinder movement and balance. Additionally, having an odd number of legs would not be symmetrical and could cause issues with coordination and stability.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 78, 'prompt_tokens': 16, 'total_tokens': 94, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CZx7V734Ub0Fy3FLYsWHRKojzvhEc', 'service_tier': 'default', 'finish_reason': '

### Custom evaluator

In [28]:
from langchain_classic.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator


In [29]:
@run_evaluator
def custom_evaluator(run) -> EvaluationResult:
    """
    checks if output contains specific word
    :param run:
    :return: int
    """
    generated = run.outputs["generations"][0][0]["text"]
    if 'human' in generated:
        score = 1
    else:
        score = 0
    return EvaluationResult(key="result", score=score)

eval_config = RunEvalConfig(
    custom_evaluator=[custom_evaluator],
    evaluators=[
        "criteria",
        "qa",         # bezpośrednio oceń odpowiedź jako "poprawną" lub "niepoprawną" na podstawie odpowiedzi referencyjnej
        "context_qa", # użyj podanego kontekstu odniesienia w celu ustalenia poprawności
        "cot_qa",     # chain of thought "reasoning"
        RunEvalConfig.Criteria(name="insensitivity"),
        RunEvalConfig.Criteria(name="relevance"),
        RunEvalConfig.Criteria(name="helpfulness"),
        RunEvalConfig.Criteria(name="maliciousness"),
        RunEvalConfig.Criteria(name="harmfulness"),
        RunEvalConfig.Criteria(name="coherence"),
        RunEvalConfig.Criteria(name="conciseness"),
        RunEvalConfig.Criteria(name="misogyny"),
        RunEvalConfig.Criteria(name="criminality"),
        RunEvalConfig.Criteria(name="controversiality"),
        RunEvalConfig.Criteria( # własne zdefiniowane kryteria, które dotyczą problemu wystepujacego w generowanych odpowiedziach
            name={
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                "Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        )
    ],
)

### Uruchomienie i integracja z LangSmith

In [30]:
import uuid

dataset_name = "existential questions run:" + uuid.uuid4().__str__() #wymagana zmiana parametru przy każdym uruchomieniu

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="evaluate LLM output",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

{'example_ids': ['25d47eab-10d6-4327-a8ae-9e9dffb0668a',
  '87ae9cd8-84b4-4f1e-b82f-813d8ca2a499'],
 'count': 2}

In [31]:
#in case of error ‘model is currently loading;’, wait couple of minutes and run notebook again
scores = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=llm_gen,
    evaluation=eval_config,
    verbose=True,
    project_name=dataset_name,
)
print(scores)

ImportError: The client.run_on_dataset function requires the langchainpackage to run.
Install with pip install langchain