In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the cells below — confirm.
load_dotenv()

True

# RAG 1: Performant
The first RAG pipeline uses best-in-class embedding and generation models, optimising for retrieval and generation accuracy.

In [2]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
import chromadb
# Aliased: importing this as plain `Settings` would shadow LlamaIndex's global
# Settings object imported above, and the embed_model assignment below would
# then land on Chroma's settings class instead.
from chromadb.config import Settings as ChromaSettings
from utils import load_documents, get_nodes, create_index

DOCUMENTS_PATH = "./source_documents/"
DB_PATH = '../chroma_db'
DB_COLLECTION_NAME = "insurance_policy_collection"
COUNT_NODES_RETRIEVED = 2

# Define Chroma client (allow_reset=True is needed for the reset() call below)
client = chromadb.PersistentClient(path=DB_PATH, settings=ChromaSettings(allow_reset=True))

# Start from an empty store. Note that reset() wipes the whole database at
# DB_PATH, not just DB_COLLECTION_NAME.
client.reset()

# Define and configure embedding and generation LLMs
Settings.embed_model = OpenAIEmbedding()  # Set embedding model globally (LlamaIndex Settings) to index and retrieve using the same model
generation_llm = OpenAI()

# Create Retriever
documents = load_documents(DOCUMENTS_PATH)
nodes = get_nodes(documents)
chroma_collection = client.get_or_create_collection(DB_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = create_index(nodes, vector_store)
retriever = index.as_retriever(similarity_top_k=COUNT_NODES_RETRIEVED)

# Create Query Engine (same top-k as the standalone retriever above)
query_engine = index.as_query_engine(
    llm=generation_llm,
    similarity_top_k=COUNT_NODES_RETRIEVED
)

In [3]:
# Sanity check: ask the query engine one sample question and print its answer
sample_question = "What types of insurance are available?"
response = query_engine.query(sample_question)
print(response)

Comprehensive Plus Insurance, Comprehensive Insurance, Third Party Fire & Theft Insurance, and Third Party Property Damage Insurance are the types of insurance available.


# Pipeline Evaluation

### Retrieval Evaluation

In [4]:
import nest_asyncio
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.llms.openai import OpenAI
from utils import display_retrieval_evaluation_results, create_retrieval_qa_dataset

# Allows for nested async calls in Jupyter notebooks
nest_asyncio.apply()

# Define evaluation LLM
evaluation_llm = OpenAI(temperature=0) # Ideally this should be a superior model to generation_llm (e.g. GPT-4), however due costs and rate limits, GPT-3.5 is used

# Create QA dataset
QA_DATASET_PATH = "./qa_datasets/qa_dataset_1.json"
qa_dataset = create_retrieval_qa_dataset(nodes, evaluation_llm, QA_DATASET_PATH)

# Evaluate QA dataset
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
display_retrieval_evaluation_results(f"top-{COUNT_NODES_RETRIEVED} eval", eval_results)

Previous retrieval evaluation dataset deleted successfully.


100%|██████████| 187/187 [07:58<00:00,  2.56s/it]


Unnamed: 0,retrievers,hit_rate,mrr
0,top-2 eval,0.754011,0.705882


### Generation Evaluation

#### Example evaluation

In [5]:
from llama_index.core.evaluation import FaithfulnessEvaluator, AnswerRelevancyEvaluator, ContextRelevancyEvaluator

# Sample query and the query engine's answer to it
query = "What types of insurance are available?"
response = query_engine.query(query)
print(response)

# One evaluator per generation metric, all backed by the same evaluation LLM
faithfulness_evaluator = FaithfulnessEvaluator(llm=evaluation_llm)
relevancy_evaluator = AnswerRelevancyEvaluator(llm=evaluation_llm)
context_evaluator = ContextRelevancyEvaluator(llm=evaluation_llm)

# Faithfulness: report the evaluator's pass/fail flag
eval_result = faithfulness_evaluator.evaluate_response(query=query, response=response)
print(f"Faithfulness: {eval_result.passing}")

# Answer relevancy: report the evaluator's score
eval_result = relevancy_evaluator.evaluate_response(query=query, response=response)
print(f"Relevance: {eval_result.score}")

# Context relevancy: report the evaluator's score
eval_result = context_evaluator.evaluate_response(query=query, response=response)
print(f"Context: {eval_result.score}")

Comprehensive Plus Insurance, Comprehensive Insurance, Third Party Fire & Theft Insurance, and Third Party Property Damage Insurance are the types of insurance available.
Faithfulness: True
Relevance: 1.0
Context: 0.375


#### Batch evaluation

In [None]:
from utils import (
    create_question_dataset, 
    create_prediction_dataset, 
    create_judges, 
    create_evaluation_tasks, 
    evaluate_tasks, 
    display_generation_evaluation_results
)

# Create the RAG question dataset from the nodes, using the evaluation LLM.
# NOTE(review): the captured output of this cell shows OpenAI rate-limit (429)
# retries, so this step presumably issues many LLM calls — confirm before re-running.
rag_dataset = create_question_dataset(nodes, evaluation_llm)

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9761752169656213 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 10000, Used 9783, Requested 689. Please try again in 2.832s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9290574125612058 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 10000, Used 9919, Requested 683. Please try again in 3.612s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5347747126873436 seconds as it ra

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 10000, Used 9929, Requested 689. Please try again in 3.708s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

Retrying llama_index.llms.openai.base.OpenAI._achat in 1.1492840800316388 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 10000, Used 9633, Requested 704. Please try again in 2.022s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 3.361541543165941 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 10000, Used 9742, Requested 718. Please try again in 2.76s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.8584602240401082 seconds as it rais

In [None]:
# Create prediction dataset from the question dataset and the query engine.
# NOTE(review): presumably runs query_engine over every question in rag_dataset — confirm in utils.
# The helper is awaited at top level, which works inside a notebook cell.
prediction_data = await create_prediction_dataset(rag_dataset, query_engine)

Batch processing of predictions: 100%|██████████| 5/5 [00:18<00:00,  3.80s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.57s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.54s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.44s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:13<00:00,  2.67s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.44s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.56s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.51s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:14<00:00,  2.86s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.44s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:11<00:00,  2.40s/it]
Batch processing of predictions: 100%|██████████| 5/5 [00:12<00:00,  2.51s/it]
Batch processing of predictions: 100%|██████████| 5/

In [None]:
# Create the judges from the evaluation LLM
judges = create_judges(evaluation_llm)
# Create evaluation tasks from the question dataset, the predictions and the judges
eval_tasks = create_evaluation_tasks(rag_dataset, prediction_data, judges)

In [None]:
# Evaluate tasks to get the generation evaluation results.
# NOTE: this rebinds eval_results, which held the retrieval evaluation results
# from the retrieval evaluation cell earlier in the notebook.
eval_results = await evaluate_tasks(eval_tasks)

In [None]:
# Display the generation evaluation results (the captured output is a table of
# mean scores per metric)
display_generation_evaluation_results(eval_results)

rag,mean value
metrics,Unnamed: 1_level_1
mean_answer_relevancy_score,0.986631
mean_context_relevancy_score,0.883065
mean_faithfulness_score,0.909091
mean_semantic_similarity_score,0.944905
