In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before any client is built.
# NOTE(review): presumably this supplies the LangSmith / OpenAI credentials used further down — confirm.
load_dotenv()

In [None]:
from langsmith import Client
from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run

# Initialize the LangSmith client used below to create the dataset and upload its examples.
# No arguments are passed, so the client runs with its default configuration
# (presumably resolved from the environment loaded above — confirm).
client = Client()

#### Dataset

In [None]:
# Register the evaluation dataset in LangSmith. The returned object's `id` is
# what the example-upload cell uses to attach its examples.
dataset = client.create_dataset(
    dataset_name="RAG-Evaluation",
    description="Un dataset para evaluar las respuestas del agente en temas de atencion al cliente utilizando RAG",
)

In [None]:
# (question, reference answer) pairs, kept side by side so each expected
# response sits next to the question it belongs to.
_qa_pairs = [
    (
        "¿Cuáles son los canales de atención al cliente que ofrece TechnoVerde?",
        "TechnoVerde ofrece varios canales de atención al cliente: una línea telefónica gratuita (0800-TECNO-VERDE) disponible 24/7, correo electrónico (soporte@technoverde.com), chat en vivo en su sitio web (www.technoverde.com), redes sociales (@TechnoVerdeOficial), y atención presencial en sus tiendas de lunes a sábado de 9:00 a 20:00.",
    ),
    (
        "¿Qué es el tiempo de respuesta para los diferentes canales de atención?",
        "El tiempo de respuesta objetivo es de menos de 3 minutos para llamadas telefónicas, menos de 4 horas hábiles para correos electrónicos, y menos de 1 minuto para el chat en vivo.",
    ),
    (
        "¿Cómo puedo devolver un producto en TechnoVerde?",
        "Los clientes pueden devolver productos no utilizados dentro de los 30 días posteriores a la compra para un reembolso completo o un cambio, siempre que el producto esté en su empaque original y en condiciones de reventa.",
    ),
    (
        "¿Qué cubre la garantía de los productos TechnoVerde?",
        "Todos los productos TechnoVerde tienen una garantía mínima de 2 años, con algunos productos premium que tienen hasta 5 años de garantía. Además, ofrecen soporte técnico gratuito durante la vida útil del producto.",
    ),
    (
        "¿Cómo funciona el programa de lealtad de TechnoVerde?",
        "El programa de lealtad de TechnoVerde otorga puntos por cada compra (1 punto por cada $10 gastados). Existen niveles de membresía: Verde, Plata, Oro y Platino, cada uno con beneficios crecientes como descuentos, acceso a eventos exclusivos y envío gratuito.",
    ),
    (
        "¿Qué medidas toma TechnoVerde para proteger los datos personales?",
        "TechnoVerde recopila solo los datos necesarios para brindar sus servicios. Los datos se almacenan en servidores encriptados y se implementan medidas de seguridad avanzadas para proteger la información contra accesos no autorizados.",
    ),
    (
        "¿Qué formación reciben los empleados de atención al cliente?",
        "Los empleados de atención al cliente reciben un programa intensivo de formación inicial de 4 semanas, que incluye conocimiento de productos, habilidades de comunicación y uso de sistemas. Además, hay programas de formación continua mensuales.",
    ),
    (
        "¿Qué hace TechnoVerde en caso de una crisis?",
        "TechnoVerde tiene un plan de continuidad del negocio que incluye la comunicación proactiva con los clientes y la búsqueda de soluciones alternativas en caso de crisis como desastres naturales o fallos tecnológicos.",
    ),
    (
        "¿Qué tecnologías innovadoras usa TechnoVerde en el servicio al cliente?",
        "TechnoVerde utiliza inteligencia artificial para responder instantáneamente a preguntas frecuentes y analizar el sentimiento del cliente. También emplean realidad aumentada para guiar a los clientes en la instalación y solución de problemas.",
    ),
    (
        "¿Cómo garantiza TechnoVerde la accesibilidad en su servicio al cliente?",
        "TechnoVerde asegura la accesibilidad mediante un diseño universal de su sitio web y aplicaciones móviles, compatibles con lectores de pantalla, y ofrece opciones de comunicación para clientes con discapacidades auditivas o del habla.",
    ),
]

# Upload the pairs as parallel input/output lists attached to the dataset.
client.create_examples(
    inputs=[{"input": question} for question, _answer in _qa_pairs],
    outputs=[{"response": answer} for _question, answer in _qa_pairs],
    dataset_id=dataset.id,
)

#### App

In [None]:
#AI App

from agent import Agent
from langchain_chroma import Chroma
from tools import create_retriever_tool_from_vectorstore, create_get_client_info_tool
from langchain_core.messages import HumanMessage, ToolMessage, AIMessage
from langchain_openai import OpenAIEmbeddings
# Directory handed to Chroma as its on-disk persistence location.
persist_directory = "./chroma_db"
# Vector store over the "rag-chroma" collection, using OpenAI embeddings.
# NOTE(review): this cell adds no documents — presumably the collection was populated
# beforehand under ./chroma_db; confirm before trusting retrieval-based scores.
vectorstore = Chroma(
    collection_name="rag-chroma",
    embedding_function=OpenAIEmbeddings(),
    persist_directory=persist_directory
)

# Tools given to the agent: a retriever built on the vector store plus a client-info tool.
tools = [create_retriever_tool_from_vectorstore(vectorstore), create_get_client_info_tool()]

from langchain_core.prompts import ChatPromptTemplate

# Prompt handed to the agent: a single system message (in Spanish) telling the assistant to
# answer questions about TechnoVerde S.A. from the information it has, not to invent
# anything, and to apologize when it does not know the answer.
# NOTE(review): the template has no placeholder for the conversation — presumably Agent
# appends the incoming messages itself; confirm in agent.py.
template = ChatPromptTemplate([
        ("system", "Sos un asistente que responde preguntas sobre la empresa TechnoVerde S.A. Para preguntas relacionadas a la empresa, responde utilizando la informacion que tenes disponible sobre la misma, no inventes informacion. Si no conoces la respuesta, simplemente decí que no lo sabes y disculpate por no poder ayudar"),
    ])

In [None]:
def call_agent(input_dict):
    """Run the agent on one dataset example and flatten its final state for evaluation.

    Args:
        input_dict: Example inputs; the question is read from ``input_dict['input']``.

    Returns:
        A dict that always contains ``"answer"`` and ``"retrieved_content"``
        (empty strings when unavailable). If anything raises while building or
        invoking the agent, the dict additionally carries ``"error"`` with the
        exception message, so the evaluation run is not aborted.
    """
    try:
        # A fresh agent is built per call so examples do not share state.
        agent = Agent(model_type="openai", prompt=template, tools=tools)
        human_message = HumanMessage(content=input_dict['input'])
        state = agent.invoke([human_message])

        if isinstance(state, dict) and "messages" in state and len(state["messages"]) > 0:
            messages = state["messages"]
            result = {"answer": "", "retrieved_content": ""}

            # Keep the content returned by the retrieval tool; if the tool ran
            # more than once, the last call wins.
            for message in messages:
                if isinstance(message, ToolMessage) and message.name == "retrieve_company_docs":
                    result["retrieved_content"] = message.content

            # Only a trailing AI message counts as the answer.
            last_message = messages[-1]
            if isinstance(last_message, AIMessage):
                result["answer"] = last_message.content

            return result

        return {"answer": "No AI response found in the state.", "retrieved_content": ""}
    except Exception as e:
        # Best-effort: report the failure in the outputs instead of raising.
        # "answer" is included so evaluators that index outputs["answer"]
        # do not hit a KeyError on failed runs.
        return {"answer": "", "error": str(e), "retrieved_content": ""}

In [None]:
# Smoke test: run the agent once on the first dataset question to inspect the output dict.
call_agent({"input": "¿Cuáles son los canales de atención al cliente que ofrece TechnoVerde?"})

#### Custom Evaluators

In [None]:
from typing import Optional, Dict, Any

from pydantic import BaseModel, Field


# Pydantic schema passed to `llm.with_structured_output`, so each judge call
# comes back with these three fields.
# NOTE(review): the class docstring and Field descriptions are presumably forwarded to
# the model as part of the schema — treat them as prompt text when editing.
class EvaluationResult(BaseModel):
    """Evaluation result from LLM."""

    # Metric name as reported by the judge; the evaluators return their own fixed key.
    key: str = Field(description="The key for the evaluation")
    # Integer score; the evaluator prompts ask for 1-10 and the evaluators divide it by 10.
    score: int = Field(description="The score for the evaluation")
    # Free-text justification for the score.
    explanation: str = Field(description="Explanation of the score")

    class Config:
        # Example payload attached to the model's generated JSON schema.
        json_schema_extra = {
            "example": {
                "key": "key_reference",
                "score": 5,
                "explanation": "The response was mostly accurate and relevant, but missed some minor details."
            }
        }

In [None]:
from langchain_openai import ChatOpenAI

# Judge model shared by the custom evaluators below (gpt-4o with temperature=0).
llm = ChatOpenAI(temperature=0, model="gpt-4o")
# Wrap the model with the EvaluationResult schema; the evaluators read `.score` from its output.
structured_llm = llm.with_structured_output(EvaluationResult)

In [None]:
# Define the evaluator function
def faithfulness_evaluator(run: Run, example: Example):
    """Score how well the agent's answer matches the content it retrieved.

    Args:
        run: LangSmith run for one example; ``"answer"`` and
            ``"retrieved_content"`` are read from ``run.outputs``.
        example: Dataset example (not used by this metric; kept for the
            evaluator signature).

    Returns:
        Dict with ``key`` ("faithfulness"), ``score`` (the judge's 1-10 score
        divided by 10) and ``comment`` (the judge's explanation).
    """
    # run.outputs is Optional in the LangSmith schema (e.g. a failed run),
    # so fall back to an empty dict instead of crashing on `.get`.
    outputs = run.outputs or {}
    prediction = outputs.get("answer") or ""
    reference = outputs.get("retrieved_content") or ""

    # Define the prompt template
    prompt = f"""
    You are evaluating the following: 'faithfulness'
    Given the prediction:
    {prediction}

    And the reference:
    {reference}

    Evaluate the prediction by comparing it with the reference. 
    Provide a score between 1 and 10, where 1 is the lowest and 10 is the highest.
    The score should reflect how well the prediction matches the reference in terms of content and accuracy.
    """

    result = structured_llm.invoke(prompt)
    return {
        "key": "faithfulness",
        "score": result.score / 10,  # normalize the 1-10 score to 0-1
        # Surface the judge's reasoning in LangSmith instead of discarding it.
        "comment": result.explanation,
    }

In [None]:
# Define the evaluator function
def relevancy_evaluator(run: Run, example: Example):
    """Score how relevant the agent's answer is to the question that was asked.

    Args:
        run: LangSmith run for one example; ``"answer"`` is read from ``run.outputs``.
        example: Dataset example; the question is read from ``example.inputs["input"]``.

    Returns:
        Dict with ``key`` ("relevancy"), ``score`` (the judge's 1-10 score
        divided by 10) and ``comment`` (the judge's explanation).
    """
    # Both mappings are Optional in the LangSmith schema, so guard against
    # None before calling `.get`.
    prediction = (run.outputs or {}).get("answer") or ""
    query = (example.inputs or {}).get("input") or ""

    # Define the prompt template
    prompt = f"""
    You are evaluating the following: 'relevancy'
    Given the prediction:
    {prediction}

    And the initial query:
    {query}

    Evaluate how relevant the prediction is to the initial query. 
    Provide a score between 1 and 10, where 1 is the lowest (not relevant at all) and 10 is the highest (extremely relevant).
    The score should reflect how well the prediction addresses and answers the initial query.
    """

    result = structured_llm.invoke(prompt)
    return {
        "key": "relevancy",
        "score": result.score / 10,  # normalize the 1-10 score to 0-1
        # Surface the judge's reasoning in LangSmith instead of discarding it.
        "comment": result.explanation,
    }

In [None]:
# Define the evaluator function for context recall
def context_recall_evaluator(run: Run, example: Example):
    """Score how much of the ground-truth answer is covered by the retrieved content.

    Args:
        run: LangSmith run for one example; ``"retrieved_content"`` is read
            from ``run.outputs``.
        example: Dataset example; the ground truth is read from
            ``example.outputs["response"]``.

    Returns:
        Dict with ``key`` ("context_recall"), ``score`` (the judge's 1-10 score
        divided by 10) and ``comment`` (the judge's explanation).
    """
    # Both outputs mappings are Optional in the LangSmith schema (failed run,
    # unlabeled example), so guard against None before calling `.get`.
    retrieved_content = (run.outputs or {}).get("retrieved_content") or ""
    ground_truth = (example.outputs or {}).get("response") or ""

    # Define the prompt template
    prompt = f"""
    You are evaluating the following: 'context recall'
    Given the retrieved content:
    {retrieved_content}

    And the ground truth:
    {ground_truth}

    Evaluate how well the retriever was able to recall all the relevant context. 
    Provide a score between 1 and 10, where:
    1 is the lowest (retrieved content contains no relevant information from the ground truth)
    10 is the highest (retrieved content contains all relevant information from the ground truth)
    
    The score should reflect how completely the retrieved content captures the relevant information present in the ground truth.
    """

    result = structured_llm.invoke(prompt)
    return {
        "key": "context_recall",
        "score": result.score / 10,  # normalize the 1-10 score to 0-1
        # Surface the judge's reasoning in LangSmith instead of discarding it.
        "comment": result.explanation,
    }

In [None]:
# Define the evaluator function for context precision
def context_precision_evaluator(run: Run, example: Example):
    """Score how relevant the retrieved content is to the question that was asked.

    Args:
        run: LangSmith run for one example; ``"retrieved_content"`` is read
            from ``run.outputs``.
        example: Dataset example; the question is read from ``example.inputs["input"]``.

    Returns:
        Dict with ``key`` ("context_precision"), ``score`` (the judge's 1-10
        score divided by 10) and ``comment`` (the judge's explanation).
    """
    # Both mappings are Optional in the LangSmith schema, so guard against
    # None before calling `.get`.
    retrieved_content = (run.outputs or {}).get("retrieved_content") or ""
    query = (example.inputs or {}).get("input") or ""

    # Define the prompt template
    prompt = f"""
    You are evaluating the following: 'context precision'
    Given the retrieved content:
    {retrieved_content}

    And the initial query:
    {query}

    Evaluate how relevant the retrieved content is to the initial query. 
    Provide a score between 1 and 10, where:
    1 is the lowest (retrieved content is not relevant at all to the query)
    10 is the highest (retrieved content is highly relevant and precisely answers the query)
    
    The score should reflect how well the retrieved content addresses the specific information needs of the query, without including unnecessary or irrelevant information.
    """

    result = structured_llm.invoke(prompt)
    return {
        "key": "context_precision",
        "score": result.score / 10,  # normalize the 1-10 score to 0-1
        # Surface the judge's reasoning in LangSmith instead of discarding it.
        "comment": result.explanation,
    }

#### Evals

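Ragas-style metrics (context recall, context precision, relevancy, faithfulness), implemented with the custom LLM-judge evaluators above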

In [None]:
# Run the agent over every example in the dataset and score each run with the
# four custom LLM-judged evaluators.
dataset_name = "RAG-Evaluation"
experiment_results = evaluate(
    call_agent,
    data=dataset_name,
    evaluators=[
        context_recall_evaluator,
        context_precision_evaluator,
        relevancy_evaluator,
        faithfulness_evaluator,
    ],
    experiment_prefix="ragas-full-evaluation",
)

Answer Correctness

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Off-the-shelf "labeled_score_string" evaluator: an LLM grades the agent's answer
# against the dataset's reference response on a 1-10 scale.
# NOTE(review): the criteria text below contains typos ("Groun Truth", "contains is not",
# "upon the in the"); it is left verbatim because it is the prompt sent to the grader and
# changing it would make scores non-comparable with earlier experiments.
answer_correctness_evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {
            "answer_correctness": """Is the Assistant's Answer aligned with the Ground Truth Answer? A score of [[1]] means that the
            Assistant answer contains is not at all based upon / grounded in the Groun Truth Answer. A score of [[5]] means 
            that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth 
            Answer. A score of [[10]] means that the Assistant answer is fully based upon the in the Ground Truth Answer."""
        },
        # If you want the score to be saved on a scale from 0 to 1
        "normalize_by": 10,
        "key": "answer_correctness"
    },
    prepare_data=lambda run, example: {
        # Failed runs may have no outputs, or outputs without "answer" (call_agent's
        # error path returns an "error" entry); grade an empty answer instead of
        # raising KeyError/TypeError and losing the feedback for that example.
        "prediction": (run.outputs or {}).get("answer") or "",
        # Reference and input come from the dataset itself; keep strict indexing so
        # a malformed example fails loudly rather than being graded against nothing.
        "reference": example.outputs["response"],
        "input": example.inputs["input"],
    },
)

# Second experiment: grade the agent's answers against the reference responses
# using only the answer-correctness evaluator.
dataset_name = "RAG-Evaluation"
experiment_results = evaluate(
    call_agent,
    data=dataset_name,
    evaluators=[answer_correctness_evaluator],
    experiment_prefix="answer-correctness",
)