**EVALUATION FOR RAG SYSTEM**

In [2]:
# Importing the necessary libraries
import ast
import pandas as pd
import os
import openai
# Respect a key that is already exported in the environment; the literal below
# is only a placeholder fallback. Do not paste a real secret into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "API_Key")

from ragas import evaluate
from ragas import SingleTurnSample, EvaluationDataset
from ragas.metrics import context_recall, faithfulness, answer_correctness

In [10]:
# Read the evaluation set from disk
test_data = pd.read_csv("RAG_test_dataset_1_5_rows.csv")


def _as_python_literal(cell):
    """Parse a string cell with ast.literal_eval; return any non-string cell unchanged."""
    if not isinstance(cell, str):
        return cell
    return ast.literal_eval(cell)


# String cells in retrieved_docs are parsed back into Python literals
# (expected to be lists); cells of any other type are passed through untouched.
test_data["retrieved_docs"] = test_data["retrieved_docs"].apply(_as_python_literal)


In [11]:
# Missing text values are swapped for a fixed placeholder and the column is cast to str
text_placeholders = {
    'generated_answer': "No response",
    'ground_truth': "No reference available",
}
for column_name, placeholder in text_placeholders.items():
    test_data[column_name] = test_data[column_name].fillna(placeholder).astype(str)

# Any retrieved_docs cell that is not already a list becomes a one-item placeholder list
test_data['retrieved_docs'] = test_data['retrieved_docs'].apply(
    lambda docs: docs if isinstance(docs, list) else ["No retrieved documents"]
)

test_data.head()

Unnamed: 0,query,generated_answer,retrieved_docs,ground_truth
0,Abbess,No response,[Level:\n4\nISCO_08_Code:\n2636\nTitle_EN:\nRe...,2636
1,Abbot,2636,[Level:\n4\nISCO_08_Code:\n2636\nTitle_EN:\nRe...,2636
2,"Academic, university: head of department or fa...",1345,[Level:\n4\nISCO_08_Code:\n2310\nTitle_EN:\nUn...,1345
3,"Academic, university: lecturer",2310,[Level:\n4\nISCO_08_Code:\n2310\nTitle_EN:\nUn...,2310
4,"Accessioner, library",2622,[Level:\n4\nISCO_08_Code:\n4411\nTitle_EN:\nLi...,3433


### Evaluation
Metrics: `context_recall`, `faithfulness`, `answer_correctness`

In [14]:
%%time
# Build one SingleTurnSample per row of the test frame
samples = [
    SingleTurnSample(
        user_input=question,
        retrieved_contexts=contexts,
        response=answer,
        reference=truth,
    )
    for question, contexts, answer, truth in zip(
        test_data['query'],
        test_data['retrieved_docs'],
        test_data['generated_answer'],
        test_data['ground_truth'],
    )
]

# Wrap the samples in an EvaluationDataset
evaluation_dataset = EvaluationDataset(samples=samples)

# Score the dataset with Ragas on the three selected metrics
selected_metrics = [context_recall, faithfulness, answer_correctness]
results = evaluate(evaluation_dataset, metrics=selected_metrics)

# Write the results table to CSV.
# NOTE(review): the DataFrame index is written as well (no index=False) — confirm that is intended.
evaluation_result_df = results.to_pandas()
evaluation_result_df.to_csv('evaluation_result_1_5_rows_df.csv')

# Print the evaluation result object
print(results)

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

{'context_recall': 1.0000, 'faithfulness': 0.6000, 'answer_correctness': 0.6814}
CPU times: user 1.15 s, sys: 49.3 ms, total: 1.19 s
Wall time: 17.4 s
