# Evaluating AI Agents

Please read the full article at [thedataguy.pro](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/).


In [1]:
import os
import getpass

# Ask for the key only when the environment does not already provide a
# non-empty one, so a fresh "Restart & Run All" neither overwrites an existing
# key nor blocks on an interactive prompt. The key is never hardcoded.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [2]:
import nest_asyncio
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and synchronous library helpers that drive their own loop would
# otherwise fail — confirm it is still required for the calls used below.
nest_asyncio.apply()

In [6]:
from ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore
from ragas.evaluation import EvaluationDataset
from ragas.dataset_schema import MultiTurnSample
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

# Initialize the LLM
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

# Example conversation with a travel booking agent
test_data = {
    "user_input": [
        {"role": "user", "content": "I need to book a flight from New York to London next Friday"},
        {"role": "assistant", "content": "I'd be happy to help you book a flight. Let me search for options...", 
         "tool_calls": [{"name": "search_flights", "arguments": {"origin": "NYC", "destination": "LON", "date": "next Friday"}}]},
        {"role": "tool", "name": "search_flights", "content": "Found 5 flights: Flight 1 (Delta, $750), Flight 2 (British Airways, $820)..."},
        {"role": "assistant", "content": "I found several flights from New York to London next Friday. The cheapest option is Delta for $750. Would you like to book this one?"},
        {"role": "user", "content": "Yes, please book the Delta flight"},
        {"role": "assistant", "content": "I'll book that for you now.", 
         "tool_calls": [{"name": "book_flight", "arguments": {"flight_id": "delta_123", "price": "$750"}}]},
        {"role": "tool", "name": "book_flight", "content": "Booking confirmed. Confirmation #: ABC123"},
        {"role": "assistant", "content": "Great news! Your flight is confirmed. Your confirmation number is ABC123. The flight is scheduled for next Friday. Is there anything else you need help with?"}
    ],
    "reference_topics": ["travel", "flight booking", "schedules", "prices"],
    "reference_tool_calls": [
        {"name": "search_flights", "args": {"origin": "NYC", "destination": "LON", "date": "next Friday"}},
        {"name": "book_flight", "args": {"flight_id": "delta_123", "price": "$750"}}
    ]
}

# Create a sample
sample = MultiTurnSample(**test_data)

# Initialize metrics
goal_accuracy = AgentGoalAccuracyWithoutReference(llm=evaluator_llm)
tool_accuracy = ToolCallAccuracy()
topic_adherence = TopicAdherenceScore(llm=evaluator_llm)

# Calculate scores
goal_score = await goal_accuracy.multi_turn_ascore(sample)
tool_score = tool_accuracy.multi_turn_score(sample)
topic_score = await topic_adherence.multi_turn_ascore(sample)

print(f"Goal Accuracy: {goal_score}")
print(f"Tool Call Accuracy: {tool_score}")
print(f"Topic Adherence: {topic_score}")



Goal Accuracy: 1.0
Tool Call Accuracy: 0.0
Topic Adherence: 0.4999999999375
