## Criteria evaluator

### Criteria evaluator

In [None]:
from langchain.evaluation import load_evaluator

# Load the *labeled* criteria evaluator: the "correctness" criterion is graded
# against a reference answer, and the reference-free "criteria" evaluator is
# not meant for it (it does not use the `reference` argument).
evaluator = load_evaluator("labeled_criteria", criteria="correctness")

# Grade the prediction for the given question against the reference answer
result = evaluator.evaluate_strings(
    prediction="2 + 2 = 5",
    input="Policz 2 + 2",
    reference="4"
)

# The prediction is wrong, so a failing grade is expected,
# e.g. {'reasoning': '...', 'value': 'N', 'score': 0}
print(result)


### Custom evaluator

In [None]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI

# LangSmith tracing configuration.
# NOTE(review): no API key is set here; presumably it is provided by the environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "GEMMA EVALUATIONs"
client = Client()

# Questions given to the models; they are also stored as the dataset example inputs
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

# llm_test generates the reference answers that are stored as the dataset outputs
llm_test = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1, max_tokens=256)
# llm_gen = ChatOpenAI(base_url="https://api-inference.huggingface.com/v1", model="google/gemma-2b-it",temperature=0.1,max_tokens=256)
# llm_gen is the model whose answers get evaluated (it is the one passed to run_on_dataset)
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1, max_tokens=256)

# Store only the generated text (`.content`) as the reference answer; `invoke`
# returns a whole message object, which is not a plain string reference.
# NOTE(review): each prompt is the question repeated 50 times (str * int), so it
# differs from the single question stored as the example input -- confirm intended.
dataset_outputs = [
    {"result": llm_test.invoke(50 * question).content}
    for question in dataset_inputs
]
print(dataset_outputs)

In [1]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def custom_evaluator(run, example) -> EvaluationResult:
    """
    Score a run by whether its generated text contains the substring 'human'.

    :param run: run whose ``outputs["generations"][0][0]["text"]`` is inspected
    :param example: dataset example for the run (not used by this check)
    :return: EvaluationResult with key "result" and score 1 when 'human'
        occurs in the text (case-sensitive substring match), otherwise 0
    """
    first_generation = run.outputs["generations"][0][0]
    mentions_human = 'human' in first_generation["text"]
    # bool -> int gives exactly the 1 / 0 score
    return EvaluationResult(key="result", score=int(mentions_human))

In [2]:
eval_config = RunEvalConfig(
    custom_evaluators=[custom_evaluator],
    evaluators=[
        "criteria",
        # "qa": directly grade a response as "correct" or "incorrect" based on the reference answer
        "qa",
        # "context_qa": use the provided reference context in determining correctness
        "context_qa",
        # "cot_qa": use chain-of-thought "reasoning" before determining a final verdict;
        # this tends to lead to responses that better correlate with human labels
        "cot_qa",
        # One criteria evaluator per named criterion, in this order
        *(
            RunEvalConfig.Criteria(criterion)
            for criterion in (
                "insensitivity",
                "relevance",
                "helpfulness",
                "maliciousness",
                "harmfulness",
                "coherence",
                "conciseness",
                "misogyny",
                "criminality",
                "controversiality",
            )
        ),
        # Custom criterion tailored to the specific problem we want to solve
        # and to the problems detected in the output
        RunEvalConfig.Criteria(
            {
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                " Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        ),
    ],
)

In [None]:
import uuid

# The random UUID suffix makes the dataset name differ on every run of the notebook
dataset_name = f"existential questions run:{uuid.uuid4()}"

# Keeping the inputs in a dataset lets chains and LLMs be run
# over one shared set of examples.
dataset = client.create_dataset(
    description="evaluate LLM output",
    dataset_name=dataset_name,
)
# One example per question; dataset_outputs supplies the matching outputs
client.create_examples(
    dataset_id=dataset.id,
    inputs=[{"question": question} for question in dataset_inputs],
    outputs=dataset_outputs,
)

In [None]:
# If this fails with ‘model is currently loading;’, wait a couple of minutes and run the notebook again.
# llm_gen is passed as the model to run, eval_config as the evaluation setup,
# and the dataset name is reused as the project name.
scores = client.run_on_dataset(
    llm_or_chain_factory=llm_gen,
    evaluation=eval_config,
    dataset_name=dataset_name,
    project_name=dataset_name,
    verbose=True,
)
print(scores)