In [None]:
import os
import getpass

# Prompt for the OpenAI API key only when it is not already configured, so
# re-running this cell (or running with the key already exported in the
# environment) neither re-prompts nor overwrites the existing value.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [2]:
import nest_asyncio
# NOTE(review): presumably patches asyncio so that nested event loops work
# inside the notebook's already-running loop — confirm against the
# nest_asyncio documentation.
nest_asyncio.apply()

In [14]:
import pandas as pd
from ragas import EvaluationDataset
from ragas import evaluate, RunConfig
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity

In [None]:
# Mock evaluation set: two hand-written samples, one record per question.
# In a real-world scenario, you would load these records from a file or database.
samples = [
    {
        "user_input": "What are the main symptoms of COVID-19?",
        "retrieved_contexts": [
            "Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.",
            "COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.",
        ],
        "response": "The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.",
        "reference": "COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.",
    },
    {
        "user_input": "How does machine learning differ from deep learning?",
        "retrieved_contexts": [
            "Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.",
            "Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).",
        ],
        "response": "Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.",
        "reference": "Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.",
    },
]

# Pivot the per-sample records into a column-oriented mapping (one list per field).
data = {
    column: [sample[column] for sample in samples]
    for column in ("user_input", "retrieved_contexts", "response", "reference")
}

# One row per sample, one column per field.
eval_data = pd.DataFrame(data)

# Convert to a format Ragas can use: build an EvaluationDataset from the DataFrame.
evaluation_dataset = EvaluationDataset.from_pandas(eval_data)
# Bare last expression so the notebook displays the dataset's repr
# (its feature names and number of samples).
evaluation_dataset


EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=2)

In [None]:
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

# Judge model: a LangChain ChatOpenAI client wrapped for use by Ragas.
# You are going to need an OpenAI API key (OPENAI_API_KEY) for this step.
chat_model = ChatOpenAI(model="gpt-4o")
evaluator_llm = LangchainLLMWrapper(chat_model)

# Metrics to compute, each instantiated with its default settings
metrics = [
    metric_cls()
    for metric_cls in (
        Faithfulness,
        FactualCorrectness,
        ResponseRelevancy,
        ContextEntityRecall,
        NoiseSensitivity,
        LLMContextRecall,
    )
]

# Run the evaluation; the evaluator LLM is required for the LLM-based metrics
results = evaluate(evaluation_dataset, metrics=metrics, llm=evaluator_llm)

# View results
print(results)

Evaluating: 100%|██████████| 12/12 [00:28<00:00,  2.41s/it]


{'faithfulness': 1.0000, 'factual_correctness': 0.6750, 'answer_relevancy': 0.9897, 'context_entity_recall': 0.8889, 'noise_sensitivity_relevant': 0.1667, 'context_recall': 0.5000}


In [26]:
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

# Define a specific test case
test_data = {
    "user_input": "What are quantum computers?",
    "response": "Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.",
    "retrieved_contexts": ["Quantum computing is a type of computation that harnesses quantum mechanical phenomena."]
}

# Create a custom evaluation metric
custom_metric = AspectCritic(
    name="quantum_accuracy", 
    llm=evaluator_llm,
    definition="Verify if the explanation of quantum computing is accurate and complete."
)

# Score the sample
sample = SingleTurnSample(**test_data)
score = await custom_metric.single_turn_ascore(sample)
print(f"Quantum accuracy score: {score}")

Quantum accuracy score: 0
