In [1]:
# Load environment variables from a local .env file before any API client is created
# (presumably this is where the OpenAI API key comes from — confirm against the project's .env).
from dotenv import load_dotenv
load_dotenv()

True

# RAG 1: Performant
The first RAG pipeline uses best-in-class embedding and generation models, optimising for retrieval and generation accuracy.

In [2]:
import chromadb
# Aliased on purpose: importing Chroma's `Settings` under its own name would shadow
# LlamaIndex's global `Settings`, and the embed-model assignment below would then be
# applied to Chroma's settings class instead of LlamaIndex's.
from chromadb.config import Settings as ChromaSettings
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from utils import load_documents, get_nodes, create_index

DOCUMENTS_PATH = "./source_documents/"
DB_PATH = '../chroma_db'
DB_COLLECTION_NAME = "insurance_policy_collection"
COUNT_NODES_RETRIEVED = 2

# Define Chroma client (allow_reset=True is required for client.reset() below)
client = chromadb.PersistentClient(path=DB_PATH, settings=ChromaSettings(allow_reset=True))

# Start from an empty store so re-running this cell does not index the documents twice.
# NOTE: reset() wipes the whole database at DB_PATH (every collection), not only DB_COLLECTION_NAME.
client.reset()

# Define and configure embedding and generation LLMs
Settings.embed_model = OpenAIEmbedding()  # LlamaIndex global setting, so indexing and retrieval use the same embedding model
generation_llm = OpenAI()

# Create Retriever
documents = load_documents(DOCUMENTS_PATH)
nodes = get_nodes(documents)
chroma_collection = client.get_or_create_collection(DB_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = create_index(nodes, vector_store)
retriever = index.as_retriever(similarity_top_k=COUNT_NODES_RETRIEVED)

# Create Query Engine (retrieves the same number of nodes as `retriever`)
query_engine = index.as_query_engine(
    llm=generation_llm,
    similarity_top_k=COUNT_NODES_RETRIEVED
)

In [3]:
# Sanity-check the pipeline end to end with a single question; the returned Response
# (answer text plus the retrieved source nodes) is displayed as the cell output.
query_engine.query("What types of insurance are available?")

Response(response='Comprehensive Plus Insurance, Comprehensive Insurance, Third Party Fire & Theft Insurance, and Third Party Property Damage Insurance are the types of insurance available.', source_nodes=[NodeWithScore(node=TextNode(id_='8cfb425c-301a-4470-8e05-675c45369e08', embedding=None, metadata={'page_label': '8', 'file_name': 'nrma-car-pds-1023-east.pdf', 'file_path': '/Users/mzhao/sei/rai-assignment/rai_assignment/source_documents/nrma-car-pds-1023-east.pdf', 'file_type': 'application/pdf', 'file_size': 454985, 'creation_date': '2024-04-11', 'last_modified_date': '2024-04-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='fe7ba71c-016d-40b5-99cf-d9f7c0010bfa', node_type=<ObjectType.DOCUMENT: '4'

# Pipeline Evaluation

### Retrieval Evaluation

In [4]:
import nest_asyncio
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.llms.openai import OpenAI
from utils import display_retrieval_evaluation_results, create_retrieval_qa_dataset

# Allows for nested async calls in Jupyter notebooks
nest_asyncio.apply()

# Define evaluation LLM
evaluation_llm = OpenAI(temperature=0, model="gpt-4") 

# Create QA dataset
QA_DATASET_PATH = "./qa_datasets/qa_dataset_1.json"
qa_dataset = create_retrieval_qa_dataset(nodes, evaluation_llm, QA_DATASET_PATH)

# Evaluate QA dataset
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
display_retrieval_evaluation_results(f"top-{COUNT_NODES_RETRIEVED} eval", eval_results)

Retrieval evaluation dataset does not exist. Creating one now...


100%|██████████| 189/189 [14:20<00:00,  4.56s/it]


Unnamed: 0,retrievers,hit_rate,mrr
0,top-2 eval,0.63172,0.553763


### Generation Evaluation

#### Example evaluation

In [5]:
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    FaithfulnessEvaluator,
)

# Judge model for the generation metrics. Ideally this would be stronger than generation_llm
# (e.g. GPT-4); because of rate limits on lower usage tiers, GPT-3.5 is used instead.
evaluation_llm = OpenAI()

# One example question answered by the pipeline under test
query = "What types of insurance are available?"
response = query_engine.query(query)
print(response)

# One evaluator per metric, all sharing the same judge model
faithfulness_evaluator = FaithfulnessEvaluator(llm=evaluation_llm)
relevancy_evaluator = AnswerRelevancyEvaluator(llm=evaluation_llm)
context_evaluator = ContextRelevancyEvaluator(llm=evaluation_llm)

# Faithfulness is reported as pass/fail
eval_result = faithfulness_evaluator.evaluate_response(query=query, response=response)
print(f"Faithfulness: {eval_result.passing}")

# Answer relevancy is reported as a score
eval_result = relevancy_evaluator.evaluate_response(query=query, response=response)
print(f"Relevance: {eval_result.score}")

# Context relevancy is reported as a score
eval_result = context_evaluator.evaluate_response(query=query, response=response)
print(f"Context: {eval_result.score}")

Comprehensive Plus Insurance, Comprehensive Insurance, Third Party Fire & Theft Insurance, and Third Party Property Damage Insurance are the types of insurance available.
Faithfulness: True
Relevance: 1.0
Context: 1.0


#### Batch evaluation

In [6]:
from utils import (
    create_question_dataset, 
    create_prediction_dataset, 
    create_judges, 
    create_evaluation_tasks, 
    evaluate_tasks, 
    display_generation_evaluation_results
)

# Create rag question dataset
# Built from the indexed nodes with the evaluation LLM (presumably the LLM writes the questions — confirm in utils).
# NOTE: evaluation_llm is whichever instance was bound last; in run order that is the default OpenAI()
# from the example-evaluation cell, not the GPT-4 instance from the retrieval-evaluation cell.
rag_dataset = create_question_dataset(nodes, evaluation_llm)

In [7]:
# Create prediction dataset
# Awaited helper that receives the question dataset and the query engine under test;
# the recorded output shows predictions being processed in batches of 10.
prediction_data = await create_prediction_dataset(rag_dataset, query_engine)

Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.05it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:05<00:00,  1.99it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:05<00:00,  1.87it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.01it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.20it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.16it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.12it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:06<00:00,  1.65it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:04<00:00,  2.08it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:05<00:00,  1.85it/s]
Batch processing of predictions:  50%|█████     | 5/10 [00:04<00:02,  1.87it/s]Retrying llama_index.llms.open

In [8]:
# Create Evaluation tasks using evaluation LLM judge
# The judges are built from evaluation_llm (the recorded retry messages show gpt-3.5-turbo being called).
judges = create_judges(evaluation_llm)
# Combines the questions, the pipeline's predictions and the judges into tasks
# (presumably one task per judge — confirm in utils).
eval_tasks = create_evaluation_tasks(rag_dataset, prediction_data, judges)

In [9]:
# Evaluate tasks to get evaluation results
# NOTE(review): the recorded run of this cell hit repeated 429 rate-limit retries and then ended with
# BadRequestError 400 ("'$.input' is invalid"), so eval_results was not produced in that run.
# Possibly one of the tasks sent an empty or malformed input to the API — TODO confirm inside evaluate_tasks.
eval_results = await evaluate_tasks(eval_tasks)

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5509140349338266 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 60000, Used 59753, Requested 1330. Please try again in 1.083s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9820748420327329 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 60000, Used 59839, Requested 1213. Please try again in 1.052s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.3630901455965

BadRequestError: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:
# Display evaluation results
# Needs eval_results from the evaluate_tasks cell; the saved notebook shows this cell unexecuted (In [None]).
display_generation_evaluation_results(eval_results)