Merged
2 changes: 2 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_flare/.env
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
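
The new flag is consumed downstream as a raw string comparison against 'true'. A minimal sketch of a slightly more forgiving reader, purely illustrative (the helper name evaluation_enabled is not part of this PR):

import os

def evaluation_enabled() -> bool:
    # Accept common truthy spellings so "True" or "1" in the .env also enable evaluation.
    return os.environ.get("IS_EVALUATION_NEEDED", "false").strip().lower() in {"true", "1", "yes"}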
6 changes: 5 additions & 1 deletion bootstraprag/templates/llamaindex/rag_with_flare/base_rag.py
@@ -16,6 +16,7 @@
# enable if you are using openai
# from llama_index.llms.openai import OpenAI
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
import qdrant_client
import logging
from dotenv import load_dotenv, find_dotenv
@@ -59,9 +60,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
        logger.info("initializing the global settings")
        Settings.embed_model = embed_model
        Settings.llm = llm

        Settings.transformations = [self.text_parser]

        self.rag_evaluator = RAGEvaluator()

        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
@@ -116,6 +118,8 @@ def _create_index_and_retriever(self):
    def query(self, query_string: str) -> RESPONSE_TYPE:
        try:
            response = self.flare_query_engine.query(str_or_query_bundle=query_string)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
            return response
        except Exception as e:
            logger.error(f'Error while inference: {e}')
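
As written, an exception raised inside rag_evaluator.evaluate() is caught by the same except block as inference errors, so an already-computed response would be dropped. A sketch of isolating evaluation failures inside the query() body so the answer is still returned; the inner try/except is an assumption, not part of the PR:

            response = self.flare_query_engine.query(str_or_query_bundle=query_string)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                try:
                    # Evaluation only logs scores; a DeepEval failure should not swallow the answer.
                    self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
                except Exception as eval_error:
                    logger.warning(f'evaluation skipped: {eval_error}')
            return response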
34 changes: 34 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_flare/rag_evaluator.py
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
                                                                                contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                             contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                               contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -8,3 +8,4 @@ llama-index-embeddings-ollama==0.1.2
llama-index-vector-stores-qdrant==0.2.14
pydantic==2.9.0
litserve==0.2.2
deepeval==1.3.2
2 changes: 2 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_hyde/.env
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
6 changes: 5 additions & 1 deletion bootstraprag/templates/llamaindex/rag_with_hyde/base_rag.py
@@ -19,6 +19,7 @@
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
import qdrant_client
import logging
from dotenv import load_dotenv, find_dotenv
@@ -62,9 +63,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
        logger.info("initializing the global settings")
        Settings.embed_model = embed_model
        Settings.llm = llm

        Settings.transformations = [self.text_parser]

        self.rag_evaluator = RAGEvaluator()

        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
@@ -126,6 +128,8 @@ def _create_index_and_retriever(self):
    def query(self, query_string: str) -> RESPONSE_TYPE:
        try:
            response = self.hyde_query_engine.query(str_or_query_bundle=query_string)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
            return response
        except Exception as e:
            logger.error(f'Error while inference: {e}')
34 changes: 34 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_hyde/rag_evaluator.py
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
                                                                                contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                             contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                               contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -7,4 +7,5 @@ llama-index-embeddings-openai==0.1.11
llama-index-embeddings-ollama==0.1.2
llama-index-vector-stores-qdrant==0.2.14
pydantic==2.9.0
litserve==0.2.2
litserve==0.2.2
deepeval==1.3.2
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
@@ -19,6 +19,7 @@
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
import qdrant_client
import logging
from dotenv import load_dotenv, find_dotenv
@@ -68,9 +69,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
        logger.info("initializing the global settings")
        Settings.embed_model = embed_model
        Settings.llm = llm

        Settings.transformations = [self.text_parser]

        self.rag_evaluator = RAGEvaluator()

        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
@@ -132,6 +134,8 @@ def _create_index_and_retriever(self):
    def query(self, query_string: str) -> RESPONSE_TYPE:
        try:
            response = self.hyde_query_engine.query(str_or_query_bundle=query_string)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
            return response
        except Exception as e:
            logger.error(f'Error while inference: {e}')
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
                                                                                contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                             contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                               contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -9,4 +9,5 @@ llama-index-embeddings-ollama==0.3.0
llama-index-vector-stores-qdrant==0.3.0
llama-index-callbacks-arize-phoenix==0.2.1
pydantic==2.9.0
litserve==0.2.2
litserve==0.2.2
deepeval==1.3.2
2 changes: 2 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_react/.env
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
34 changes: 34 additions & 0 deletions bootstraprag/templates/llamaindex/rag_with_react/rag_evaluator.py
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
                                                                                contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                             contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                               contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -10,6 +10,7 @@
from llama_index.core.agent import ReActAgent
from llama_index.llms.ollama import Ollama
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
from dotenv import load_dotenv, find_dotenv
from typing import Union
import qdrant_client
@@ -57,6 +58,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
        Settings.chunk_size = chunk_size
        Settings.chunk_overlap = chunk_overlap

        self.rag_evaluator = RAGEvaluator()

        # Create a local Qdrant vector store
        logger.info("initializing the vector store related objects")
        self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -119,6 +122,9 @@ def _create_react_agent(self):

    def query(self, user_query: str) -> RESPONSE_TYPE:
        try:
            return self.agent.query(str_or_query_bundle=user_query)
            response = self.agent.query(str_or_query_bundle=user_query)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
            return response
        except Exception as e:
            logger.error(f'Error while generating response: {e}')
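
In the ReAct template the evaluation (three DeepEval metric calls) runs before the agent's answer is returned, which adds latency to every request. A hedged sketch of moving it off the request path with a daemon thread inside the query() body; the threading wrapper is an assumption and not part of this PR:

            # assumes `import threading` at the top of base_rag.py
            response = self.agent.query(str_or_query_bundle=user_query)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                # Fire-and-forget: the scores are only logged, so the caller does not wait on DeepEval.
                threading.Thread(
                    target=self.rag_evaluator.evaluate,
                    kwargs={'user_query': user_query, 'response_obj': response},
                    daemon=True,
                ).start()
            return response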
@@ -7,4 +7,5 @@ llama-index-embeddings-ollama==0.3.0
llama-index-vector-stores-qdrant==0.3.0
qdrant-client==1.11.1
pydantic==2.9.0
litserve==0.2.2
litserve==0.2.2
deepeval==1.3.2
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true
@@ -0,0 +1,34 @@
from deepeval.integrations.llama_index import (
    DeepEvalFaithfulnessEvaluator,
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalContextualRelevancyEvaluator
)
from dotenv import load_dotenv, find_dotenv
from typing import Any
import os
import logging

_ = load_dotenv(find_dotenv())
logging.basicConfig(level=int(os.environ['INFO']))
logger = logging.getLogger(__name__)


class RAGEvaluator:
    def __init__(self):
        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()

    def evaluate(self, user_query: str, response_obj: Any):
        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
        actual_output = response_obj.response
        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
                                                                                contexts=retrieval_context)
        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                             contexts=retrieval_context)
        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
                                                                               contexts=retrieval_context)
        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
@@ -10,6 +10,7 @@
from llama_index.core.agent import ReActAgent
from llama_index.llms.ollama import Ollama
from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
from rag_evaluator import RAGEvaluator
from dotenv import load_dotenv, find_dotenv
from typing import Union
import llama_index
@@ -63,6 +64,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
        Settings.chunk_size = chunk_size
        Settings.chunk_overlap = chunk_overlap

        self.rag_evaluator = RAGEvaluator()

        # Create a local Qdrant vector store
        logger.info("initializing the vector store related objects")
        self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -128,6 +131,9 @@ def _create_react_agent(self):

    def query(self, user_query: str) -> RESPONSE_TYPE:
        try:
            return self.agent.query(str_or_query_bundle=user_query)
            response = self.agent.query(str_or_query_bundle=user_query)
            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
                self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
            return response
        except Exception as e:
            logger.error(f'Error while generating response: {e}')
@@ -4,6 +4,7 @@ arize-phoenix==4.33.1
qdrant-client==1.11.1
pydantic==2.9.0
litserve==0.2.2
deepeval==1.3.2
llama-index-llms-openai==0.2.3
llama-index-llms-ollama==0.3.1
llama-index-embeddings-openai==0.2.4
@@ -22,3 +22,5 @@ NOTSET = 0

LIT_SERVER_PORT=8000
LIT_SERVER_WORKERS_PER_DEVICE=4

IS_EVALUATION_NEEDED=true