In [1]:
import import_ipynb
from advanced_rag import *
import json
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
import pandas as pd
import numpy as np

llm.model_name: gpt-3.5-turbo


## Creating necessery variables such as vectore base and declaring a llm

In [2]:
db, llm = initialize_rag(model_choice="gpt")

No new documents to add.


## Create Evaluation Dataset

We are building a dataset for RAGAS evaluation using questions and ground truth answers from `qa_eval.json`.

For each entry, we will include:
- **user_input**: the question
- **response**: the answer generated by the LLM using retrieved context
- **retrieved_contexts**: the documents returned by the retriever
- **reference**: the ground truth answer

In [3]:
def load_eval_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        eval_data = json.load(file)
        queries = [entry["question"] for entry in eval_data]
        ground_truth = [entry["ground_truth"] for entry in eval_data]
    return queries, ground_truth

def create_evaluation_dataset(answers, contexts, queries, ground_truth):
    eval_dataset = []
    for i in range(len(queries)):
        eval_dataset.append({
            "user_input": queries[i],
            "response": answers[i],
            "retrieved_contexts": contexts[i],
            "reference": ground_truth[i]
        })
    return eval_dataset

def extract_and_print_contexts(result):
    contexts = [doc.page_content for doc in result.get("source_documents", [])]
    return contexts

def run_queries(rag_chain, file_path):
    results = []
    contexts = []
    queries, ground_truth = load_eval_data(file_path)
    for query in queries:
        result = rag_chain({"question": query})
        results.append(result['answer'])
        contexts.append(extract_and_print_contexts(result))
    return create_evaluation_dataset(results, contexts, queries, ground_truth)

In [4]:
def evaluate_dataset(evaluation_dataset, llm):
    evaluator_llm = LangchainLLMWrapper(llm)
    print(type(evaluator_llm))
    print(evaluator_llm)
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[
            LLMContextRecall(), 
            Faithfulness(), 
            FactualCorrectness()
        ],
        llm=evaluator_llm,
    )
    return result

In [5]:
def eval_dataset_to_df(dataset):
    return pd.DataFrame([
        {
            "Question": entry["user_input"],
            "Response": entry["response"],
            "Context (1st)": entry["retrieved_contexts"][0] if entry["retrieved_contexts"] else "",
            "Ground Truth": entry["reference"],
        }
        for entry in dataset
    ])

In [6]:
def evaluation_result_to_df(results) -> pd.DataFrame:
    scores_dict = results._scores_dict
    df = pd.DataFrame(scores_dict).T
    df.columns = [f"q{i+1}" for i in range(df.shape[1])]
    df["avg"] = df.mean(axis=1).round(4)
    return df

In [7]:
rag_chain, rag_memory = create_rag_chain(
    vectorstore=db,
    llm=llm,
    prompt_type='zero_shot',
    use_memory=True
    )



In [8]:
file_path = 'qa_eval.json'
dataset = run_queries(rag_chain, file_path)
eval_dataset = EvaluationDataset.from_list(dataset)
results = evaluate_dataset(eval_dataset, llm)
print(results)

  result = rag_chain({"question": query})


<class 'ragas.llms.base.LangchainLLMWrapper'>
LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

{'context_recall': 1.0000, 'faithfulness': 0.7778, 'factual_correctness(mode=f1)': 0.6700}


In [9]:
df1 = evaluation_result_to_df(results)
df1

Unnamed: 0,q1,q2,q3,avg
context_recall,1.0,1.0,1.0,1.0
faithfulness,1.0,0.333333,1.0,0.7778
factual_correctness(mode=f1),0.67,0.67,0.67,0.67


In [10]:
rag_chain, rag_memory = create_rag_chain(
    vectorstore=db,
    llm=llm,
    prompt_type='cot',
    use_memory=True,
    use_reranking=True,
    additional_prompt_instruction="provide only final answer"
    )

In [11]:
dataset = run_queries(rag_chain, file_path)
eval_dataset = EvaluationDataset.from_list(dataset)
results = evaluate_dataset(eval_dataset, llm)
print(results)

<class 'ragas.llms.base.LangchainLLMWrapper'>
LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

{'context_recall': 1.0000, 'faithfulness': 0.6111, 'factual_correctness(mode=f1)': 0.5567}


In [12]:
df2 = evaluation_result_to_df(results)
df2

Unnamed: 0,q1,q2,q3,avg
context_recall,1.0,1.0,1.0,1.0
faithfulness,0.333333,1.0,0.5,0.6111
factual_correctness(mode=f1),1.0,0.67,0.0,0.5567


In [13]:
rag_chain, rag_memory = create_rag_chain(
    vectorstore=db,
    llm=llm,
    prompt_type='react',
    use_memory=True,
    use_reranking=True,
    rewrite=True,
    additional_prompt_instruction="provide only final answer"
    )

In [14]:
dataset = run_queries(rag_chain, file_path)
eval_dataset = EvaluationDataset.from_list(dataset)
results = evaluate_dataset(eval_dataset, llm)
print(results)

Rewritten query: content='Can you provide detailed information on the characteristics and gameplay significance of Pokémon EX cards in the Pokémon Trading Card Game?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 85, 'total_tokens': 108, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BO98QtlAjqhEkbKFNDcvK3xoJcX9c', 'finish_reason': 'stop', 'logprobs': None} id='run-70b4925c-55a4-41a1-83e9-30404faea099-0' usage_metadata={'input_tokens': 85, 'output_tokens': 23, 'total_tokens': 108, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Rewritten content: Can you provide detailed information on the characteristics and gameplay signific

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

{'context_recall': 1.0000, 'faithfulness': 0.7000, 'factual_correctness(mode=f1)': 0.8700}


In [15]:
df3 = evaluation_result_to_df(results)
df3

Unnamed: 0,q1,q2,q3,avg
context_recall,1.0,1.0,1.0,1.0
faithfulness,0.6,0.5,1.0,0.7
factual_correctness(mode=f1),1.0,0.75,0.86,0.87


## SUMMARY

In [27]:
# Merge the dataframes
merged_df = pd.concat([df1, df2, df3], axis=1)

# Create MultiIndex with labels for each group
n_cols = df1.shape[1]  # number of columns per df

merged_df.columns = pd.MultiIndex.from_tuples(
    [("Simple RAG + Zero-Shot", col) for col in df1.columns] +
    [("Reranking + CoT", col) for col in df2.columns] +
    [("Reranking, Query Rewriting + React", col) for col in df3.columns]
)

merged_df

Unnamed: 0_level_0,Simple RAG + Zero-Shot,Simple RAG + Zero-Shot,Simple RAG + Zero-Shot,Simple RAG + Zero-Shot,Reranking + CoT,Reranking + CoT,Reranking + CoT,Reranking + CoT,"Reranking, Query Rewriting + React","Reranking, Query Rewriting + React","Reranking, Query Rewriting + React","Reranking, Query Rewriting + React"
Unnamed: 0_level_1,q1,q2,q3,avg,q1,q2,q3,avg,q1,q2,q3,avg
context_recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
faithfulness,1.0,0.333333,1.0,0.7778,0.333333,1.0,0.5,0.6111,0.6,0.5,1.0,0.7
factual_correctness(mode=f1),0.67,0.67,0.67,0.67,1.0,0.67,0.0,0.5567,1.0,0.75,0.86,0.87


In [28]:
final_df = merged_df.T.groupby(level=0).mean().T

final_df

Unnamed: 0,Reranking + CoT,"Reranking, Query Rewriting + React",Simple RAG + Zero-Shot
context_recall,1.0,1.0,1.0
faithfulness,0.611108,0.7,0.777783
factual_correctness(mode=f1),0.556675,0.87,0.67
