diff --git a/bootstraprag/templates/llamaindex/rag_with_flare/.env b/bootstraprag/templates/llamaindex/rag_with_flare/.env
index 88b8230..595f232 100644
--- a/bootstraprag/templates/llamaindex/rag_with_flare/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_flare/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_flare/base_rag.py b/bootstraprag/templates/llamaindex/rag_with_flare/base_rag.py
index fb94208..81c3da8 100644
--- a/bootstraprag/templates/llamaindex/rag_with_flare/base_rag.py
+++ b/bootstraprag/templates/llamaindex/rag_with_flare/base_rag.py
@@ -16,6 +16,7 @@
 # enable if you are using openai
 # from llama_index.llms.openai import OpenAI
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 import qdrant_client
 import logging
 from dotenv import load_dotenv, find_dotenv
@@ -59,9 +60,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
         logger.info("initializing the global settings")
         Settings.embed_model = embed_model
         Settings.llm = llm
-        Settings.transformations = [self.text_parser]
+
+        self.rag_evaluator = RAGEvaluator()
 
         self.text_chunks = []
         self.doc_ids = []
         self.nodes = []
@@ -116,6 +118,8 @@ def _create_index_and_retriever(self):
     def query(self, query_string: str) -> RESPONSE_TYPE:
         try:
             response = self.flare_query_engine.query(str_or_query_bundle=query_string)
+            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
             return response
         except Exception as e:
             logger.error(f'Error while inference: {e}')
diff --git a/bootstraprag/templates/llamaindex/rag_with_flare/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_flare/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_flare/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
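Note: the `rag_evaluator.py` module above is copied verbatim into each of the nine templates touched by this patch. For orientation, here is a minimal sketch of how it is meant to be driven, shown against a plain `VectorStoreIndex` query engine rather than any specific template class (the index setup and the "data" directory are illustrative assumptions, not part of this patch):

    import os
    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
    from rag_evaluator import RAGEvaluator

    # Build any query engine whose responses carry source_nodes;
    # RAGEvaluator only reads response.response and response.source_nodes.
    documents = SimpleDirectoryReader(input_dir="data").load_data()
    query_engine = VectorStoreIndex.from_documents(documents).as_query_engine()

    user_query = "What does FLARE change compared to plain retrieval?"
    response = query_engine.query(user_query)
    if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
        RAGEvaluator().evaluate(user_query=user_query, response_obj=response)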
diff --git a/bootstraprag/templates/llamaindex/rag_with_flare/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_flare/requirements.txt
index a48fe28..86a1eff 100644
--- a/bootstraprag/templates/llamaindex/rag_with_flare/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_flare/requirements.txt
@@ -8,3 +8,4 @@ llama-index-embeddings-ollama==0.1.2
 llama-index-vector-stores-qdrant==0.2.14
 pydantic==2.9.0
 litserve==0.2.2
+deepeval==1.3.2
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde/.env b/bootstraprag/templates/llamaindex/rag_with_hyde/.env
index 88b8230..595f232 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde/base_rag.py b/bootstraprag/templates/llamaindex/rag_with_hyde/base_rag.py
index 44625f5..cf66e79 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde/base_rag.py
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde/base_rag.py
@@ -19,6 +19,7 @@
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.indices.query.query_transform import HyDEQueryTransform
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 import qdrant_client
 import logging
 from dotenv import load_dotenv, find_dotenv
@@ -62,9 +63,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
         logger.info("initializing the global settings")
         Settings.embed_model = embed_model
         Settings.llm = llm
-        Settings.transformations = [self.text_parser]
+
+        self.rag_evaluator = RAGEvaluator()
 
         self.text_chunks = []
         self.doc_ids = []
         self.nodes = []
@@ -126,6 +128,8 @@ def _create_index_and_retriever(self):
     def query(self, query_string: str) -> RESPONSE_TYPE:
         try:
             response = self.hyde_query_engine.query(str_or_query_bundle=query_string)
+            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
             return response
         except Exception as e:
             logger.error(f'Error while inference: {e}')
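Note: all three DeepEval wrappers follow LlamaIndex's `BaseEvaluator` interface, so each `evaluate(...)` call returns an `EvaluationResult`. The logger lines in `rag_evaluator.py` surface only `.score`; assuming the result object carries the usual LlamaIndex fields, the pass/fail verdict and free-text feedback are worth logging too (a sketch under that assumption):

    from typing import Sequence
    from deepeval.integrations.llama_index import DeepEvalFaithfulnessEvaluator

    def faithfulness_report(user_query: str, actual_output: str,
                            retrieval_context: Sequence[str]) -> str:
        # Same call shape as rag_evaluator.py; .passing and .feedback come
        # from llama_index.core.evaluation.EvaluationResult.
        result = DeepEvalFaithfulnessEvaluator().evaluate(
            query=user_query, response=actual_output, contexts=list(retrieval_context)
        )
        return f"passing={result.passing} score={result.score} feedback={result.feedback}"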
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_hyde/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_hyde/requirements.txt
index f3a6443..3547d46 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde/requirements.txt
@@ -7,4 +7,5 @@ llama-index-embeddings-openai==0.1.11
 llama-index-embeddings-ollama==0.1.2
 llama-index-vector-stores-qdrant==0.2.14
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/.env b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/.env
index d6e9266..6aa970c 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/base_rag.py b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/base_rag.py
index 3cfc4e4..3569fc8 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/base_rag.py
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/base_rag.py
@@ -19,6 +19,7 @@
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.indices.query.query_transform import HyDEQueryTransform
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 import qdrant_client
 import logging
 from dotenv import load_dotenv, find_dotenv
@@ -68,9 +69,10 @@ def __init__(self, data_path: str, chunk_size: int = 512, chunk_overlap: int = 2
         logger.info("initializing the global settings")
         Settings.embed_model = embed_model
         Settings.llm = llm
-        Settings.transformations = [self.text_parser]
+
+        self.rag_evaluator = RAGEvaluator()
 
         self.text_chunks = []
         self.doc_ids = []
         self.nodes = []
@@ -132,6 +134,8 @@ def _create_index_and_retriever(self):
     def query(self, query_string: str) -> RESPONSE_TYPE:
         try:
             response = self.hyde_query_engine.query(str_or_query_bundle=query_string)
+            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+                self.rag_evaluator.evaluate(user_query=query_string, response_obj=response)
             return response
         except Exception as e:
             logger.error(f'Error while inference: {e}')
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/requirements.txt
index b5e7372..d9e8ef0 100644
--- a/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_hyde_with_observability/requirements.txt
@@ -9,4 +9,5 @@ llama-index-embeddings-ollama==0.3.0
 llama-index-vector-stores-qdrant==0.3.0
 llama-index-callbacks-arize-phoenix==0.2.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
diff --git a/bootstraprag/templates/llamaindex/rag_with_react/.env b/bootstraprag/templates/llamaindex/rag_with_react/.env
index 88b8230..595f232 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_react/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_react/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_react/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_react/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_react/react_agent_with_query_engine.py b/bootstraprag/templates/llamaindex/rag_with_react/react_agent_with_query_engine.py
index f3a7eea..41518e0 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react/react_agent_with_query_engine.py
+++ b/bootstraprag/templates/llamaindex/rag_with_react/react_agent_with_query_engine.py
@@ -10,6 +10,7 @@
 from llama_index.core.agent import ReActAgent
 from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import qdrant_client
@@ -57,6 +58,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         Settings.chunk_size = chunk_size
         Settings.chunk_overlap = chunk_overlap
 
+        self.rag_evaluator = RAGEvaluator()
+
         # Create a local Qdrant vector store
         logger.info("initializing the vector store related objects")
         self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -119,6 +122,9 @@ def _create_react_agent(self):
     def query(self, user_query: str) -> RESPONSE_TYPE:
         try:
-            return self.agent.query(str_or_query_bundle=user_query)
+            response = self.agent.query(str_or_query_bundle=user_query)
+            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+                self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
+            return response
         except Exception as e:
             logger.error(f'Error while generating response: {e}')
diff --git a/bootstraprag/templates/llamaindex/rag_with_react/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_react/requirements.txt
index 85fa73f..d71d85c 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_react/requirements.txt
@@ -7,4 +7,5 @@ llama-index-embeddings-ollama==0.3.0
 llama-index-vector-stores-qdrant==0.3.0
 qdrant-client==1.11.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
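Note: in both ReAct templates the agent's response is now bound to a local variable before returning, so the evaluator can inspect it on the way out. The same pattern, factored as a standalone helper for clarity (`query_with_optional_eval` is an illustrative name, not part of this patch):

    import os
    from typing import Any
    from rag_evaluator import RAGEvaluator

    def query_with_optional_eval(agent: Any, user_query: str) -> Any:
        # Mirrors the change to query(): capture, optionally evaluate, then return.
        response = agent.query(user_query)
        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
            RAGEvaluator().evaluate(user_query=user_query, response_obj=response)
        return response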
diff --git a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/.env b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/.env
index 88b8230..595f232 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/react_agent_with_query_engine.py b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/react_agent_with_query_engine.py
index 90f778c..d116d4a 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/react_agent_with_query_engine.py
+++ b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/react_agent_with_query_engine.py
@@ -10,6 +10,7 @@
 from llama_index.core.agent import ReActAgent
 from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import llama_index
@@ -63,6 +64,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         Settings.chunk_size = chunk_size
         Settings.chunk_overlap = chunk_overlap
 
+        self.rag_evaluator = RAGEvaluator()
+
         # Create a local Qdrant vector store
         logger.info("initializing the vector store related objects")
         self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -128,6 +131,9 @@ def _create_react_agent(self):
     def query(self, user_query: str) -> RESPONSE_TYPE:
         try:
-            return self.agent.query(str_or_query_bundle=user_query)
+            response = self.agent.query(str_or_query_bundle=user_query)
+            if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+                self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
+            return response
         except Exception as e:
             logger.error(f'Error while generating response: {e}')
diff --git a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/requirements.txt
index 8ed72be..d950d1b 100644
--- a/bootstraprag/templates/llamaindex/rag_with_react_with_observability/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_react_with_observability/requirements.txt
@@ -4,6 +4,7 @@ arize-phoenix==4.33.1
 qdrant-client==1.11.1
 pydantic==2.9.0
 litserve==0.2.2
+deepeval==1.3.2
 llama-index-llms-openai==0.2.3
 llama-index-llms-ollama==0.3.1
 llama-index-embeddings-openai==0.2.4
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction/.env b/bootstraprag/templates/llamaindex/rag_with_self_correction/.env
index e42b2b0..4e992b5 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_self_correction/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_self_correction/requirements.txt
index 85fa73f..d71d85c 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction/requirements.txt
@@ -7,4 +7,5 @@ llama-index-embeddings-ollama==0.3.0
 llama-index-vector-stores-qdrant==0.3.0
 qdrant-client==1.11.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction/self_correction_core.py b/bootstraprag/templates/llamaindex/rag_with_self_correction/self_correction_core.py
index f42e458..7611557 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction/self_correction_core.py
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction/self_correction_core.py
@@ -11,6 +11,7 @@
 from llama_index.core.query_engine import RetryQueryEngine, RetrySourceQueryEngine, RetryGuidelineQueryEngine
 from llama_index.core.evaluation import RelevancyEvaluator, GuidelineEvaluator
 from llama_index.core.evaluation.guideline import DEFAULT_GUIDELINES
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import qdrant_client
@@ -56,6 +57,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         Settings.chunk_size = chunk_size
         Settings.chunk_overlap = chunk_overlap
 
+        self.rag_evaluator = RAGEvaluator()
+
         # Create a local Qdrant vector store
         logger.info("initializing the vector store related objects")
         self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -126,4 +129,6 @@ def query_with_guideline_query_engine(self, query: str) -> RESPONSE_TYPE:
                                                                  guideline_eval, resynthesize_query=True,
                                                                  max_retries=self.no_of_retries)
         retry_guideline_response = retry_guideline_query_engine.query(query)
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=query, response_obj=retry_guideline_response)
         return retry_guideline_response
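Note: the self-correction templates already make an evaluator-LLM call per retry inside `RetryGuidelineQueryEngine`, and each `RAGEvaluator.evaluate(...)` adds three more judge calls (one per metric), all synchronous in the query path. If that overhead matters, evaluation could be sampled rather than run on every gated query (a sketch; `sample_rate` is an invented knob, not part of this patch):

    import os
    import random

    def should_evaluate(sample_rate: float = 0.1) -> bool:
        # Evaluate only a fraction of gated queries to bound judge-LLM cost.
        if os.environ.get('IS_EVALUATION_NEEDED') != 'true':
            return False
        return random.random() < sample_rate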
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/.env b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/.env
index 88b8230..595f232 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/.env
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/.env
@@ -22,3 +22,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/rag_evaluator.py b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/requirements.txt b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/requirements.txt
index 3f98ade..5e06e64 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/requirements.txt
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/requirements.txt
@@ -9,4 +9,5 @@ llama-index-callbacks-arize-phoenix==0.2.1
 qdrant-client==1.11.1
 pydantic==2.9.0
 arize-phoenix==4.33.1
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
diff --git a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/self_correction_core.py b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/self_correction_core.py
index d269e94..2491447 100644
--- a/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/self_correction_core.py
+++ b/bootstraprag/templates/llamaindex/rag_with_self_correction_with_observability/self_correction_core.py
@@ -12,6 +12,7 @@
 from llama_index.core.query_engine import RetryQueryEngine, RetrySourceQueryEngine, RetryGuidelineQueryEngine
 from llama_index.core.evaluation import RelevancyEvaluator, GuidelineEvaluator
 from llama_index.core.evaluation.guideline import DEFAULT_GUIDELINES
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import qdrant_client
@@ -63,6 +64,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         Settings.chunk_size = chunk_size
         Settings.chunk_overlap = chunk_overlap
 
+        self.rag_evaluator = RAGEvaluator()
+
         # Create a local Qdrant vector store
         logger.info("initializing the vector store related objects")
         self.client: qdrant_client.QdrantClient = qdrant_client.QdrantClient(url=os.environ['DB_URL'],
@@ -131,4 +134,6 @@ def query_with_guideline_query_engine(self, query: str) -> RESPONSE_TYPE:
         retry_guideline_query_engine = RetryGuidelineQueryEngine(self.base_query_engine,
                                                                  guideline_eval, resynthesize_query=True)
         retry_guideline_response = retry_guideline_query_engine.query(query)
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=query, response_obj=retry_guideline_response)
         return retry_guideline_response
diff --git a/bootstraprag/templates/llamaindex/simple_rag/.env b/bootstraprag/templates/llamaindex/simple_rag/.env
index 8bb2d5d..34130ba 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/.env
+++ b/bootstraprag/templates/llamaindex/simple_rag/.env
@@ -23,3 +23,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true
diff --git a/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py b/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
index d78d1a0..9540f9c 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
+++ b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
@@ -8,4 +8,4 @@ llama-index-vector-stores-qdrant==0.3.0
 qdrant-client==1.11.1
 pydantic==2.9.0
 litserve==0.2.2
-ragas==0.1.20
\ No newline at end of file
+deepeval==1.3.2
diff --git a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
index dcc3e2c..0c76107 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
+++ b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
@@ -10,7 +10,7 @@
 from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
 from dotenv import load_dotenv, find_dotenv
-from test_set_generator import TestSetGenerator
+from rag_evaluator import RAGEvaluator
 from typing import Union
 import qdrant_client
 import logging
@@ -38,7 +38,7 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self.query_engine_tools = []
         self.show_progress = show_progress
 
-        self.test_set_generator = TestSetGenerator()
+        self.rag_evaluator = RAGEvaluator()
 
         # use your prefered vector embeddings model
         logger.info("initializing the OllamaEmbedding")
@@ -67,9 +67,6 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
 
     def _create_index(self):
-        # create an evaluation test set
-        self.test_set_generator.generate_test_set(input_dir=self.input_dir)  # leaving defaults as is.
-
         if self.client.collection_exists(collection_name=os.environ['COLLECTION_NAME']):
             try:
                 self._index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store)
@@ -94,4 +91,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
         query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
         logger.info("LLM is thinking...")
         response = query_engine.query(str_or_query_bundle=user_query)
+        logger.info(f'response: {response}')
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
         return response
diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py b/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")
diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt b/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
index 93cf554..0525aa2 100644
--- a/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
@@ -9,4 +9,5 @@ qdrant-client==1.11.1
 arize-phoenix==4.33.1
 llama-index-callbacks-arize-phoenix==0.2.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file
diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py b/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
index 78dfe4c..066c11e 100644
--- a/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
@@ -4,12 +4,12 @@
     StorageContext,
     Settings
 )
-from llama_index.core.tools import QueryEngineTool, ToolMetadata
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.vector_stores.qdrant import QdrantVectorStore
 from llama_index.core.agent import ReActAgent
 from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import qdrant_client
@@ -44,6 +44,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self.query_engine_tools = []
         self.show_progress = show_progress
 
+        self.rag_evaluator = RAGEvaluator()
+
         # use your prefered vector embeddings model
         logger.info("initializing the OllamaEmbedding")
         embed_model = OllamaEmbedding(model_name=os.environ['OLLAMA_EMBED_MODEL'],
@@ -93,4 +95,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
         query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
         logger.info("LLM is thinking...")
         response = query_engine.query(str_or_query_bundle=user_query)
+        logger.info(f'response: {response}')
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
         return response
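Note: one caveat that applies to every template touched here: the gate is an exact string comparison, so `IS_EVALUATION_NEEDED=True`, `TRUE`, or `1` in .env silently disables evaluation. A more forgiving parse, should that be wanted (sketch):

    import os

    def evaluation_needed() -> bool:
        # Accept common truthy spellings rather than only the literal 'true'.
        value = os.environ.get('IS_EVALUATION_NEEDED', 'false')
        return value.strip().lower() in {'true', '1', 'yes'}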