## Criteria evaluator

### Instalacja bibliotek

In [104]:
! pip install -U langsmith langchain langchain-core langchain-text-splitters



In [105]:
from dotenv import load_dotenv
load_dotenv()

True

### Prosta ewaluacja za pomocą kryteriów
Criteria evaluator w LangChain służy do oceniania wyników modeli LLM według zadanego kryterium, np. correctness (poprawność).

In [106]:
from langchain_classic.evaluation import load_evaluator, Criteria, EvaluatorType
import json

# 1) Użyjemy ewaluatora z referencją (labeled_criteria)
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# 2) Porównujemy odpowiedź modelu z referencją
result = evaluator.evaluate_strings(
    prediction="2 + 2 = 4",
    input="Policz 2 + 2",
    reference="4",
)

print(json.dumps(result, indent=4))

{
    "reasoning": "The criterion for this task is the correctness of the submission. The input asks to calculate 2 + 2. The submission provided is 2 + 2 = 4. The reference answer is 4. \n\nComparing the submission to the reference, we can see that the submission is correct. The calculation 2 + 2 does indeed equal 4, which matches the reference answer. \n\nTherefore, the submission meets the criterion of correctness. \n\nY",
    "value": "Y",
    "score": 1
}


### Import bibliotek i konfiguracja LangSmith

In [107]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()

# Włącz śledzenie LangSmith (wymaga konta LangSmith):
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "kurs-demo"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGSMITH_API_KEY"] = "<TWÓJ_KLUCZ>" #załączony w .env

### Wygenerowanie odpowiedzi LLM

In [108]:
client = Client()
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

llm_test= ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1,max_tokens=256)
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1,max_tokens=256)

dataset_outputs = [
    {"result": llm_test.invoke(dataset_inputs[0])},
    {"result": llm_test.invoke(dataset_inputs[1])},
]
print(dataset_outputs)

[{'result': AIMessage(content='People do not have three legs because humans are typically born with two legs as part of their natural anatomy. The human body has evolved over millions of years to have two legs for walking and running efficiently. Having three legs would likely be a genetic anomaly or mutation that is not advantageous for survival and reproduction. Additionally, having an odd number of legs could potentially cause balance and coordination issues.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 76, 'prompt_tokens': 16, 'total_tokens': 92, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CaH2wRQEs0v8U3aHk1wswf14WCZAQ', 'service_tier': 'default', 'finish_reason': 'st

### Custom evaluator

In [109]:
from langchain_classic.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator

In [110]:
@run_evaluator
def custom_evaluator(run) -> EvaluationResult:
    """
    checks if output contains specific word
    :param run:
    :return: int
    """
    generated = str(run).lower()
    if 'human' in generated:
        score = 1
    else:
        score = 0
    return EvaluationResult(key="result", score=score)

eval_config = RunEvalConfig(
    custom_evaluators=[custom_evaluator],
    evaluators=[
        EvaluatorType.CRITERIA,
        EvaluatorType.QA,         # bezpośrednio oceń odpowiedź jako "poprawną" lub "niepoprawną" na podstawie odpowiedzi referencyjnej
        EvaluatorType.CONTEXT_QA, # użyj podanego kontekstu odniesienia w celu ustalenia poprawności
        EvaluatorType.COT_QA,     # chain of thought "reasoning"
        RunEvalConfig.Criteria(criteria=Criteria.INSENSITIVITY),
        RunEvalConfig.Criteria(criteria=Criteria.RELEVANCE),
        RunEvalConfig.Criteria(criteria=Criteria.HELPFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.MALICIOUSNESS),
        RunEvalConfig.Criteria(criteria=Criteria.HARMFULNESS),
        RunEvalConfig.Criteria(criteria=Criteria.COHERENCE),
        RunEvalConfig.Criteria(criteria=Criteria.CONCISENESS),
        RunEvalConfig.Criteria(criteria=Criteria.MISOGYNY),
        RunEvalConfig.Criteria(criteria=Criteria.CRIMINALITY),
        RunEvalConfig.Criteria(criteria=Criteria.CONTROVERSIALITY),
        RunEvalConfig.Criteria( # własne zdefiniowane kryteria, które dotyczą problemu wystepujacego w generowanych odpowiedziach
            criteria={
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                "Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        )
    ],
)

### Uruchomienie i integracja z LangSmith

In [111]:
import uuid

dataset_name = "existential questions run:" + uuid.uuid4().__str__() #wymagana zmiana parametru przy każdym uruchomieniu

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="evaluate LLM output",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

{'example_ids': ['4701aea1-691d-4136-a9c4-d3b46216664c',
  '234011b0-33c5-4c2a-9048-d2ad6ad47f09'],
 'count': 2}

In [112]:
eval_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def construct_chain():
    return llm_gen

scores = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=eval_config,
    project_name=dataset_name,
    verbose=True,
)
print(scores)


View the evaluation results for project 'existential questions run:5f48150e-b9c3-4a7f-911a-fcbb0710a2ad' at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/94a4a4d4-398d-414f-9759-e8da117ad25b/compare?selectedSessions=bdcaec62-ab61-40bf-81b2-26a72744c2bf

View all tests for Dataset existential questions run:5f48150e-b9c3-4a7f-911a-fcbb0710a2ad at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/94a4a4d4-398d-414f-9759-e8da117ad25b
[------------------------------------------------->] 2/2


Unnamed: 0,feedback.helpfulness,feedback.correctness,feedback.Contextual Accuracy,feedback.COT Contextual Accuracy,feedback.insensitivity,feedback.relevance,feedback.maliciousness,feedback.harmfulness,feedback.coherence,feedback.conciseness,feedback.misogyny,feedback.criminality,feedback.controversiality,feedback.valuation,feedback.result,error,execution_time,run_id
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2
unique,,,,,,,,,,,,,,,,0.0,,2
top,,,,,,,,,,,,,,,,,,c7f863cb-ffbc-40c4-a212-5a7b112b4043
freq,,,,,,,,,,,,,,,,,,1
mean,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,,5.512837,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,,0.995484,
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,4.808924,
25%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,,5.160881,
50%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,,5.512837,
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0,1.0,,5.864794,


{'project_name': 'existential questions run:5f48150e-b9c3-4a7f-911a-fcbb0710a2ad', 'results': {'234011b0-33c5-4c2a-9048-d2ad6ad47f09': {'input': {'question': 'Why people are not flying?'}, 'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is the helpfulness of the submission. \n\nThe submission is a response to the question "Why people are not flying?" The AI provides a comprehensive list of potential reasons why people might not be flying as much at a given time. These reasons include pandemic concerns, travel restrictions, economic factors, environmental concerns, technological alternatives, security concerns, service and experience, cost, and convenience. \n\nEach reason is explained in a clear and concise manner, providing the user with a thorough understanding of the various factors that could deter people from flying. \n\nThe submission is insightful as it covers a wide range of factors, some of which the user might not have