2 changes: 2 additions & 0 deletions bootstraprag/templates/llamaindex/simple_rag/.env
@@ -23,3 +23,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
34 changes: 34 additions & 0 deletions bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
# the template's .env defines numeric log levels (it contains NOTSET = 0), so 'INFO' is expected to resolve to an int
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -8,4 +8,4 @@ llama-index-vector-stores-qdrant==0.3.0
qdrant-client==1.11.1
pydantic==2.9.0
litserve==0.2.2
ragas==0.1.20
deepeval==1.3.2
10 changes: 5 additions & 5 deletions bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
@@ -10,7 +10,7 @@
from llama_index.llms.ollama import Ollama
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from dotenv import load_dotenv, find_dotenv
from test_set_generator import TestSetGenerator
from rag_evaluator import RAGEvaluator
from typing import Union
import qdrant_client
import logging
@@ -38,7 +38,7 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
        self.query_engine_tools = []
        self.show_progress = show_progress

        self.test_set_generator = TestSetGenerator()
        self.rag_evaluator = RAGEvaluator()

        # use your preferred vector embeddings model
        logger.info("initializing the OllamaEmbedding")
@@ -67,9 +67,6 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =

    def _create_index(self):

        # create an evaluation test set
        self.test_set_generator.generate_test_set(input_dir=self.input_dir)  # leaving defaults as is.

        if self.client.collection_exists(collection_name=os.environ['COLLECTION_NAME']):
            try:
                self._index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store)
@@ -94,4 +91,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
        query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
        logger.info("LLM is thinking...")
        response = query_engine.query(str_or_query_bundle=user_query)
        logger.info(f'response: {response}')
        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
        return response
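
With this change, evaluation is gated purely by the IS_EVALUATION_NEEDED environment variable, so it can be toggled without touching code. A minimal sketch of how a caller might drive it; the SimpleRAG class name and the 'data' directory are assumptions, since the class name is not visible in this diff:

# Hedged sketch: SimpleRAG and 'data' are assumed names, not taken from the diff.
import os
os.environ['IS_EVALUATION_NEEDED'] = 'true'   # or rely on the value in .env

from simple_rag import SimpleRAG  # assumed class name inside simple_rag.py

rag = SimpleRAG(input_dir='data', similarity_top_k=3)
answer = rag.do_rag(user_query='Summarize the key points of the document.')
print(answer)
# Any value other than the exact string 'true' skips evaluation, since the
# check in do_rag() is a plain string comparison.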
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
# the template's .env defines numeric log levels (it contains NOTSET = 0), so 'INFO' is expected to resolve to an int
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(
            query=user_query, response=actual_output, contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -9,4 +9,5 @@ qdrant-client==1.11.1
arize-phoenix==4.33.1
llama-index-callbacks-arize-phoenix==0.2.1
pydantic==2.9.0
litserve==0.2.2
litserve==0.2.2
deepeval==1.3.2
@@ -4,12 +4,12 @@
    StorageContext,
    Settings
)
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.agent import ReActAgent
from llama_index.llms.ollama import Ollama
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
from dotenv import load_dotenv, find_dotenv
from typing import Union
import qdrant_client
@@ -44,6 +44,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
        self.query_engine_tools = []
        self.show_progress = show_progress

        self.rag_evaluator = RAGEvaluator()

        # use your preferred vector embeddings model
        logger.info("initializing the OllamaEmbedding")
        embed_model = OllamaEmbedding(model_name=os.environ['OLLAMA_EMBED_MODEL'],
@@ -93,4 +95,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
        query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
        logger.info("LLM is thinking...")
        response = query_engine.query(str_or_query_bundle=user_query)
        logger.info(f'response: {response}')
        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
        return response