In [32]:
import import_ipynb
from advanced_rag import *
import json
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
import pandas as pd
import numpy as np

## Creating the necessary variables: the vector store and the LLM

In [11]:
# Create the two objects the rest of the notebook depends on: `db` is later
# passed to create_rag_chain() as the vector store, and `llm` is used both for
# generation and as the RAGAS evaluator model.
# initialize_rag is presumably provided by the `advanced_rag` star-import — TODO confirm.
db, llm = initialize_rag()

No new documents to add.


In [22]:
# Show which model the notebook is using. Some LLM wrappers expose the name as
# `model_name`, others as `model`, so fall back to the latter when the former
# is not available.
if hasattr(llm, "model_name"):
    print("llm.model_name:", llm.model_name)
else:
    print("llm.model:", llm.model)

llm.model_name: gpt-3.5-turbo


## Create Evaluation Dataset

We are building a dataset for RAGAS evaluation using questions and ground truth answers from `qa_eval.json`.

For each entry, we will include:
- **user_input**: the question
- **response**: the answer generated by the LLM using retrieved context
- **retrieved_contexts**: the documents returned by the retriever
- **reference**: the ground truth answer

In [13]:
def load_eval_data(file_path):
    """Read the evaluation file and split it into questions and reference answers.

    The file is opened as UTF-8 text and parsed with ``json.load``. Every entry
    obtained by iterating over the parsed value must support
    ``entry["question"]`` and ``entry["ground_truth"]``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``(queries, ground_truth)`` tuple of two lists, both in file order.

    Raises:
        KeyError: If an entry lacks ``"question"`` or ``"ground_truth"``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    queries = []
    ground_truth = []
    for record in records:
        queries.append(record["question"])
        ground_truth.append(record["ground_truth"])
    return queries, ground_truth

def create_evaluation_dataset(answers, contexts, queries, ground_truth):
    """Zip the four parallel sequences into one record per question.

    Each record uses the field names listed in this notebook's
    "Create Evaluation Dataset" section: ``user_input``, ``response``,
    ``retrieved_contexts`` and ``reference``.

    Args:
        answers: Generated answers, one per query.
        contexts: Retrieved contexts, one entry per query.
        queries: The questions that were asked.
        ground_truth: Reference answers, one per query.

    Returns:
        A list of dicts, one per query, in input order.

    Raises:
        ValueError: If the four inputs do not all have the same length.
            Index-based pairing would otherwise either fail midway or
            silently drop trailing items and misalign the evaluation rows.
    """
    sizes = {
        "answers": len(answers),
        "contexts": len(contexts),
        "queries": len(queries),
        "ground_truth": len(ground_truth),
    }
    if len(set(sizes.values())) > 1:
        raise ValueError(f"Evaluation inputs must have equal lengths, got {sizes}")

    return [
        {
            "user_input": query,
            "response": answer,
            "retrieved_contexts": context,
            "reference": reference,
        }
        for query, answer, context, reference in zip(
            queries, answers, contexts, ground_truth
        )
    ]

def extract_and_print_contexts(result):
    """Collect the text of every source document attached to a chain result.

    Despite the function name, nothing is printed; the texts are only returned.

    Args:
        result: Mapping with an optional ``"source_documents"`` entry whose
            items expose a ``page_content`` attribute.

    Returns:
        A list with each document's ``page_content``, in order. The list is
        empty when ``"source_documents"`` is absent.
    """
    page_texts = []
    for document in result.get("source_documents", []):
        page_texts.append(document.page_content)
    return page_texts

def run_queries(rag_chain, file_path):
    """Ask the chain every question in the evaluation file and gather the rows.

    Each question is sent on its own with an empty ``chat_history``. The
    chain's ``'answer'`` and the text of its source documents are collected.

    Args:
        rag_chain: Callable that accepts a dict with ``"question"`` and
            ``"chat_history"`` and returns a mapping containing ``'answer'``.
        file_path: Path of the JSON file handed to ``load_eval_data``.

    Returns:
        The list of records built by ``create_evaluation_dataset``.
    """
    queries, ground_truth = load_eval_data(file_path)

    answers = []
    retrieved = []
    for question in queries:
        chain_output = rag_chain({"question": question, "chat_history": []})
        answers.append(chain_output['answer'])
        retrieved.append(extract_and_print_contexts(chain_output))

    return create_evaluation_dataset(answers, retrieved, queries, ground_truth)

In [14]:
def evaluate_dataset(evaluation_dataset, llm):
    """Score an evaluation dataset with three RAGAS metrics.

    The given LLM is wrapped in ``LangchainLLMWrapper`` and passed to
    ``evaluate`` as the judge for ``LLMContextRecall``, ``Faithfulness`` and
    ``FactualCorrectness``.

    Args:
        evaluation_dataset: Dataset handed to ``evaluate`` as ``dataset``.
        llm: LLM object to wrap and use as the evaluator.

    Returns:
        Whatever ``evaluate`` returns for the dataset and metrics.
    """
    evaluator_llm = LangchainLLMWrapper(llm)

    # Diagnostic output: the wrapper's type first, then its repr.
    for shown in (type(evaluator_llm), evaluator_llm):
        print(shown)

    metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]
    return evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm,
    )

In [52]:
def eval_dataset_to_df(dataset):
    """Turn evaluation records into a table that is easy to read.

    Args:
        dataset: Iterable of mappings with the keys ``user_input``,
            ``response``, ``retrieved_contexts`` and ``reference``.

    Returns:
        A ``pd.DataFrame`` with the columns ``Question``, ``Response``,
        ``Context (1st)`` and ``Ground Truth``. Only the first retrieved
        context is shown; an empty string is used when there is none.
    """
    rows = []
    for entry in dataset:
        row = {
            "Question": entry["user_input"],
            "Response": entry["response"],
        }
        contexts = entry["retrieved_contexts"]
        row["Context (1st)"] = contexts[0] if contexts else ""
        row["Ground Truth"] = entry["reference"]
        rows.append(row)
    return pd.DataFrame(rows)

In [47]:
def evaluation_result_to_df(results) -> pd.DataFrame:
    """Lay out per-question scores as one row per metric.

    The scores are read from ``results._scores_dict`` and transposed so that
    each metric becomes a row. Columns are then named ``q1``, ``q2``, ... and
    an ``avg`` column holds the row mean rounded to four decimals.

    NOTE(review): ``_scores_dict`` is a private attribute (leading
    underscore); consider switching to a public accessor if the library
    offers one.

    Args:
        results: Object exposing ``_scores_dict``, a mapping from metric name
            to a list of per-question scores.

    Returns:
        A ``pd.DataFrame`` indexed by metric name.
    """
    per_metric = pd.DataFrame(results._scores_dict).transpose()
    question_count = per_metric.shape[1]
    per_metric.columns = [f"q{number}" for number in range(1, question_count + 1)]
    # The mean is taken before "avg" exists, so it covers the q* columns only.
    per_metric["avg"] = per_metric.mean(axis=1).round(4)
    return per_metric

In [24]:
# Build the RAG chain that will be evaluated, using the vector store and LLM
# created earlier in the notebook.
# NOTE(review): use_memory=True, while run_queries() sends an empty
# chat_history with every question — confirm that the memory object does not
# carry earlier question/answer pairs into later evaluation questions.
# NOTE(review): top_k_chunks=10 is passed together with use_reranking=False and
# retriever_k=3; presumably it only matters when reranking is on — TODO confirm.
# NOTE(review): with prompt_type='react' the recorded responses start with
# "Question: ... Thought: ...", so that scaffolding is part of the text being
# scored — verify this is intended for the factual-correctness metric.
rag_chain, rag_memory = create_rag_chain(
    vectorstore=db,
    llm=llm,
    prompt_type='react',
    use_memory=True,
    retriever_k=3,
    use_reranking=False,
    top_k_chunks=10)

In [23]:
# Re-check the model name right before the evaluation run (same check as the
# earlier cell). Prefer `model_name`; fall back to `model` when it is missing.
if hasattr(llm, "model_name"):
    print("llm.model_name:", llm.model_name)
else:
    print("llm.model:", llm.model)

llm.model_name: gpt-3.5-turbo


In [25]:
# Run every question from qa_eval.json through the chain, wrap the resulting
# rows in a RAGAS EvaluationDataset, and score them. The same `llm` that
# generated the answers is also used as the evaluator.
# `dataset` (the plain list of dicts) and `results` are reused by later cells.
file_path = 'qa_eval.json'
dataset = run_queries(rag_chain, file_path)
eval_dataset = EvaluationDataset.from_list(dataset)
results = evaluate_dataset(eval_dataset, llm)
print(results)

<class 'ragas.llms.base.LangchainLLMWrapper'>
LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

{'context_recall': 1.0000, 'faithfulness': 0.8333, 'factual_correctness(mode=f1)': 0.4533}


In [48]:
# Per-question metric scores: one row per metric, plus an "avg" column.
df = evaluation_result_to_df(results)
df

Unnamed: 0,q1,q2,q3,avg
context_recall,1.0,1.0,1.0,1.0
faithfulness,1.0,0.5,1.0,0.8333
factual_correctness(mode=f1),0.67,0.29,0.4,0.4533


In [54]:
# Show the questions, generated responses, first retrieved context and
# ground truth side by side.
# NOTE(review): this rebinds `df`, replacing the scores table from the
# previous cell — a distinct name would keep both tables available.
df = eval_dataset_to_df(dataset)
df

Unnamed: 0,Question,Response,Context (1st),Ground Truth
0,What is a Pokémon EX card?,Question: What is a Pokémon EX card?\nThought:...,28\nPokémon T rading Card Game Rules\nAPPENDIX...,Pokémon-EX cards are a special kind of card th...
1,What does haste do in Magic: The Gathering?,Question: What does haste do in Magic: The Gat...,702.8b Multiple instances of flash on the same...,Haste is a keyword that allows a creature to a...
2,How can a player build a hotel in Monopoly?,Question: How can a player build a hotel in Mo...,HOTELS...When a player has four houses on each...,A player must have four houses on every proper...
