# Pairwise RAG pipeline evaluation
End-to-end (E2E) pairwise evaluation that compares the two RAG pipelines defined below in order to recommend one of them.



In [1]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the API credentials used by the OpenAI clients
# created further down — TODO confirm which keys are expected.
from dotenv import load_dotenv
load_dotenv()

True

## RAG1: Performant

In [3]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
import chromadb
# Chroma also exports a class called `Settings`. It is imported under an alias
# so that it does not shadow LlamaIndex's global `Settings` object; otherwise
# `Settings.embed_model = ...` below would silently set an attribute on
# Chroma's class and the LlamaIndex embedding model would never be configured.
from chromadb.config import Settings as ChromaSettings
from utils import load_documents, get_nodes, create_index

DOCUMENTS_PATH = "./source_documents"
DB_PATH = '../chroma_db'
DB_COLLECTION_NAME = "insurance_policy_collection"
COUNT_NODES_RETRIEVED = 2

# Define Chroma client.
# NOTE: Chroma raises "ValueError: An instance of Chroma already exists for
# <path> with different settings" if a client for DB_PATH was already created
# in this kernel with other settings. Restart the kernel before re-running
# this cell if that happens.
client = chromadb.PersistentClient(
    path=DB_PATH,
    settings=ChromaSettings(allow_reset=True),  # allow_reset is required for client.reset()
)

# Delete existing collection if exists (reset() wipes the whole database at DB_PATH)
client.reset()

# Define and configure embedding and generation LLMs
Settings.embed_model = OpenAIEmbedding()  # Set embedding model globally to index and retrieve using the same model
generation_llm = OpenAI()

# Create Retriever
documents = load_documents(DOCUMENTS_PATH)
nodes = get_nodes(documents)
chroma_collection = client.get_or_create_collection(DB_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = create_index(nodes, vector_store)
retriever = index.as_retriever(similarity_top_k=COUNT_NODES_RETRIEVED)

# Create Query Engine
query_engine_1 = index.as_query_engine(
    llm=generation_llm,
    similarity_top_k=COUNT_NODES_RETRIEVED
)

ValueError: An instance of Chroma already exists for ../chroma_db with different settings

## RAG2: Fast and free

In [None]:
from llama_index.core import Settings
from llama_index.core.embeddings import resolve_embed_model
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.llms.ollama import Ollama
import weaviate
from utils import load_documents, get_nodes, create_index

DOCUMENTS_PATH = "./source_documents/"
INDEX_NAME = "InsurancePolicyIndex"
COUNT_NODES_RETRIEVED = 2
DENSE_VECTOR_HYBRID_WEIGHTING = 0.5

# Define Weaviate client (embedded instance started from this process).
# NOTE: this rebinds the notebook-level names `client`, `generation_llm`,
# `documents`, `nodes`, `index` and `retriever` that the RAG1 cell also assigns.
client = weaviate.Client(embedded_options=weaviate.EmbeddedOptions())

# Delete existing index if exists
client.schema.delete_class(INDEX_NAME)

# Define and configure embedding and generation LLMs
Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")  # Set embedding model globally to index and retrieve using the same model
generation_llm = Ollama(model="gemma:2b", request_timeout=30.0)

# Build the index
documents = load_documents(DOCUMENTS_PATH)
nodes = get_nodes(documents)
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=INDEX_NAME)
index = create_index(nodes, vector_store)

# Retrieval settings shared by the standalone retriever and the query engine,
# so that both return the same nodes for a given query. Previously the
# retriever was created with library defaults and ignored
# COUNT_NODES_RETRIEVED and the hybrid search configuration.
retrieval_kwargs = {
    "vector_store_query_mode": "hybrid",
    "alpha": DENSE_VECTOR_HYBRID_WEIGHTING,
    "similarity_top_k": COUNT_NODES_RETRIEVED,
}

# Create Retriever
retriever = index.as_retriever(**retrieval_kwargs)

# Configure Query Engine
query_engine_2 = index.as_query_engine(
    llm=generation_llm,
    **retrieval_kwargs,
)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

## Evaluation

### Example Evaluation

In [8]:
# NOTE: this cell is intentionally inert. It holds a commented-out inline copy
# of display_pairwise_eval_df; the notebook imports the live helper from
# `utils` in the following cell. Because the pandas import below is also
# commented out, `pd` is NOT defined by this cell.
# import pandas as pd
# # define jupyter display function
# def display_pairwise_eval_df(query, response1, response2, eval_result) -> None:
#     eval_df = pd.DataFrame(
#         {
#             "Query": query,
#             "Reference Response (Answer 1)": response2,
#             "Current Response (Answer 2)": response1,
#             "Score": eval_result.score,
#             "Reason": eval_result.feedback,
#         },
#         index=[0],
#     )
#     eval_df = eval_df.style.set_properties(
#         **{
#             "inline-size": "300px",
#             "overflow-wrap": "break-word",
#         },
#         subset=["Current Response (Answer 2)", "Reference Response (Answer 1)"]
#     )
#     display(eval_df)

In [18]:
from llama_index.core.evaluation import PairwiseComparisonEvaluator
from utils import display_pairwise_eval_df

# Define evaluation LLM
evaluation_llm = OpenAI(temperature=0, model="gpt-4")

# Example Queries
query = "What insurances are available?"
response1 = str(query_engine_1.query(query))
response2 = str(query_engine_2.query(query))

# Evaluate pair of responses
pairwise_evaluator = PairwiseComparisonEvaluator(llm=evaluation_llm)
eval_result = await pairwise_evaluator.aevaluate(
    query=query, response=response1, second_response=response2
)

display_pairwise_eval_df(query, response1, response2, eval_result)

Unnamed: 0,Query,Reference Response (Answer 1),Current Response (Answer 2),Score,Reason
0,What are the problems with HN?,The problem with HN is that it was a major source of stress and a 60% chance of being the source of HN's problems.,"The issues with HN were that it caused a significant amount of stress for the individual involved, making it the biggest source of stress in their work. This stress was not directly related to the core work of selecting and helping founders, which made it a source of frustration and distraction.",1.0,"Assistant A provides a more detailed and coherent response to the user's question. It explains that the problems with HN were related to the stress it caused, which was not directly related to the core work of selecting and helping founders, making it a source of frustration and distraction. On the other hand, Assistant B's response is less clear and seems to contain a statistical error or misunderstanding (""a 60% chance of being the source of HN's problems""). Therefore, Assistant A's response is more helpful, accurate, and detailed. Final Verdict: [[A]]"


### Batch evaluation

In [None]:
from utils import (
    create_question_dataset, 
    create_prediction_dataset, 
    evaluate_tasks, 
)

# Create rag question dataset from the indexed nodes, using the evaluation LLM.
# `nodes` is assigned by both pipeline cells above, so the value used here is
# whichever of those cells ran last in this kernel.
# create_question_dataset is a project helper from utils; how it derives the
# questions is not visible in this notebook.
rag_dataset = create_question_dataset(nodes, evaluation_llm)

In [None]:
# Create prediction datasets: one per pipeline, both built from the same
# rag_dataset so the two sets of answers can be compared question by question.
# create_prediction_dataset is a project helper from utils; the objects it
# returns expose `.predictions`, which the evaluation-task cell iterates over.
prediction_data_1 = await create_prediction_dataset(rag_dataset, query_engine_1)
prediction_data_2 = await create_prediction_dataset(rag_dataset, query_engine_2)

In [None]:
# Create Evaluation tasks using evaluation LLM judge.
# A single evaluator is reused for every task instead of building a new one
# per example.
batch_pairwise_evaluator = PairwiseComparisonEvaluator(llm=evaluation_llm)

# Each entry is an un-awaited coroutine; nothing is sent to the judge until
# the tasks are awaited in a later cell.
eval_tasks = [
    batch_pairwise_evaluator.aevaluate(
        # Use the question belonging to this example, not the notebook-level
        # `query` left over from the single-example cell.
        query=example.query,
        # Pass the answer text rather than the prediction object.
        # Assumes the predictions are LlamaIndex RagExamplePrediction-style
        # objects exposing `.response` — TODO confirm against
        # utils.create_prediction_dataset.
        response=prediction_1.response,
        second_response=prediction_2.response,
        sleep_time_in_seconds=1.5,
    )
    for example, prediction_1, prediction_2 in zip(
        rag_dataset.examples,
        prediction_data_1.predictions,
        prediction_data_2.predictions,
    )
]

In [None]:
# Evaluate tasks to get evaluation results.
# evaluate_tasks is a project helper from utils that is awaited with the list
# of pending pairwise evaluations; how it schedules or batches them is not
# visible in this notebook.
eval_results = await evaluate_tasks(eval_tasks)

In [None]:
# pandas must be imported here: the only other `import pandas as pd` in this
# notebook is commented out, so `pd` would otherwise be undefined on a fresh
# kernel and `pd.concat` below would raise NameError.
import pandas as pd
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

# Viewing evaluation results.
# Every result is labelled "mean value" so they all fall into one group.
# Only the second frame returned by get_eval_results_df is used here.
_, mean_pairwise_df = get_eval_results_df(
    ["mean value"] * len(eval_results),
    eval_results,
    metric="pairwise",
)

# The list holds a single frame today; further per-metric mean frames can be
# appended to it to show several metrics in one table.
mean_scores_df = pd.concat(
    [
        mean_pairwise_df.reset_index(),
    ],
    axis=0,
    ignore_index=True,
)
mean_scores_df = mean_scores_df.set_index("index")
mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])
mean_scores_df

rag,mean value
metrics,Unnamed: 1_level_1
mean_answer_relevancy_score,0.864865
mean_context_relevancy_score,0.728041
mean_faithfulness_score,0.513514
mean_semantic_similarity_score,0.924617
