# RAG 2: Fast and Free 
The second RAG pipeline leverages open source and local technologies, optimising for fast performance and free implementation.

In [1]:
from llama_index.core import Settings
from llama_index.core.embeddings import resolve_embed_model
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.llms.ollama import Ollama
import weaviate
from utils import load_documents, get_nodes, create_index

DOCUMENTS_PATH = "./source_documents/"
INDEX_NAME = "InsurancePolicyIndex" 
COUNT_NODES_RETRIEVED = 2
DENSE_VECTOR_HYBRID_WEIGHTING = 0.5 

# Define Weaviate client 
client = weaviate.Client(embedded_options=weaviate.EmbeddedOptions())

# Delete existing index if exists
client.schema.delete_class(INDEX_NAME)

# Define and configure embedding and generation LLMs
Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5") # Set embedding model globally to index and retrieve using the same model 
generation_llm  = Ollama(model="gemma:2b", request_timeout=30.0)

# Create Retriever
documents = load_documents(DOCUMENTS_PATH)
nodes = get_nodes(documents)
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=INDEX_NAME)
index = create_index(nodes, vector_store)
retriever = index.as_retriever()

# Configure Query Engine
query_engine = index.as_query_engine(
    vector_store_query_mode = "hybrid", 
    alpha = DENSE_VECTOR_HYBRID_WEIGHTING,
    llm = generation_llm,
    similarity_top_k = COUNT_NODES_RETRIEVED, 
)

Started /Users/mzhao/.cache/weaviate-embedded: process ID 22069


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-04-15T21:58:01+10:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-04-15T21:58:01+10:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-04-15T21:58:01+10:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2024-04-15T21:58:01+10:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-04-15T21:58:01+10:00"}
            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.
{"action":"

In [2]:
query_engine.query("What types of insurance are available?")

Sure, the types of insurance available are:

- Comprehensive Plus Insurance
- Comprehensive Insurance
- Third Party Fire & Theft Insurance
- Third Party Property Damage Insurance


# Pipeline Evaluation

In [3]:
# Load OPENAI_API_KEY from .env file for evaluation
from dotenv import load_dotenv
load_dotenv()

True

### Retreival Evaluation

In [4]:
import nest_asyncio
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.llms.openai import OpenAI
from utils import display_retrieval_evaluation_results, create_retrieval_qa_dataset

# Allows for nested async calls in Jupyter notebooks
nest_asyncio.apply()

# Define evaluation LLM
evaluation_llm = OpenAI(temperature=0, model="gpt-4")

# Create QA dataset
QA_DATASET_PATH = "./qa_datasets/qa_dataset_2.json"
qa_dataset = create_retrieval_qa_dataset(nodes, evaluation_llm, QA_DATASET_PATH)

# Evaluate QA dataset
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
display_retrieval_evaluation_results(f"top-{COUNT_NODES_RETRIEVED} eval", eval_results)

Previous retrieval evaluation dataset deleted successfully.


  0%|          | 0/189 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 189/189 [04:22<00:00,  1.39s/it]


Unnamed: 0,retrievers,hit_rate,mrr
0,top-2 eval,0.624339,0.542328


### Generation Evaluation

#### Example evaluation

In [2]:
from llama_index.core.evaluation import FaithfulnessEvaluator, AnswerRelevancyEvaluator, ContextRelevancyEvaluator

# Define evaluation LLM
evaluation_llm = OpenAI() # Ideally this should be a superior model to generation_llm (e.g. GPT-4), however due to rate limits on lower usage tiers, GPT-3.5 is used

# Example query and response 
query = "What types of insurance are available?"
response = query_engine.query(query)
print(response)

# Example Faithfulness evaluation
faithfulness_evaluator = FaithfulnessEvaluator(llm=evaluation_llm)
eval_result = faithfulness_evaluator.evaluate_response(query=query, response=response)
print("Faithfulness: " + str(eval_result.passing))

# Example Relevancy evaluation
relevancy_evaluator = AnswerRelevancyEvaluator(llm=evaluation_llm)
eval_result = relevancy_evaluator.evaluate_response(query=query, response=response)
print("Relevance: " + str(eval_result.score))

# Example Context evaluation
context_evaluator = ContextRelevancyEvaluator(llm=evaluation_llm)
eval_result = context_evaluator.evaluate_response(query=query, response=response)
print("Context: " + str(eval_result.score))

NameError: name 'query_engine' is not defined

#### Batch evaluation

In [5]:
from utils import (
    create_question_dataset, 
    create_prediction_dataset, 
    create_judges, 
    create_evaluation_tasks, 
    evaluate_tasks, 
    display_generation_evaluation_results
)

# Create rag question dataset    
rag_dataset = create_question_dataset(nodes, evaluation_llm)

In [6]:
# Create prediction dataset
prediction_data = await create_prediction_dataset(rag_dataset, query_engine)

Batch processing of predictions: 100%|██████████| 10/10 [00:22<00:00,  2.21s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:17<00:00,  1.80s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:21<00:00,  2.19s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:21<00:00,  2.18s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:24<00:00,  2.50s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:24<00:00,  2.47s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:25<00:00,  2.58s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:26<00:00,  2.61s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:26<00:00,  2.60s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:21<00:00,  2.15s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:23<00:00,  2.32s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:24<00:00,  2.42s/it]
Batch processing of predicti

In [7]:
# Create Evaluation tasks using evaluation LLM judge
judges = create_judges(evaluation_llm)
eval_tasks = create_evaluation_tasks(rag_dataset, prediction_data, judges)

In [8]:
# Evaluate tasks to get evaluation results
eval_results = await evaluate_tasks(eval_tasks)

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5434413177537917 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 60000, Used 59262, Requested 1040. Please try again in 302ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5826342051111667 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-M4tHPAmLSbtutEWD7DSkmcHM on tokens per min (TPM): Limit 60000, Used 59577, Requested 1353. Please try again in 930ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.882263269603150

In [9]:
# Display evaluation results
display_generation_evaluation_results(eval_results)

rag,mean value
metrics,Unnamed: 1_level_1
mean_answer_relevancy_score,0.90873
mean_context_relevancy_score,0.844164
mean_faithfulness_score,0.835979
mean_semantic_similarity_score,0.893557
