From 98a79be54a48b407de14496d65db733c22c4c096 Mon Sep 17 00:00:00 2001
From: pavanmantha
Date: Mon, 30 Sep 2024 22:16:36 +0530
Subject: [PATCH 1/2] removed the secret

---
 .../templates/llamaindex/simple_rag/.env | 5 +-
 .../llamaindex/simple_rag/requirements.txt | 3 +-
 .../llamaindex/simple_rag/simple_rag.py | 7 +++
 .../simple_rag/test_set_generator.py | 48 +++++++++++++++++++
 4 files changed, 60 insertions(+), 3 deletions(-)
 create mode 100644 bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py

diff --git a/bootstraprag/templates/llamaindex/simple_rag/.env b/bootstraprag/templates/llamaindex/simple_rag/.env
index 88b8230..8bb2d5d 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/.env
+++ b/bootstraprag/templates/llamaindex/simple_rag/.env
@@ -1,9 +1,10 @@
 DB_URL='http://localhost:6333'
 DB_API_KEY='th3s3cr3tk3y'
-COLLECTION_NAME='YOUR_COLLECTION'
+COLLECTION_NAME='YOUR_TEST_COLLECTION'
 
 OPENAI_API_KEY=''
-OPENAI_EMBED_MODEL=''
+OPENAI_EMBED_MODEL='text-embedding-3-small'
+OPENAI_MODEL='gpt-4o'
 
 # use this in case you prefer to experiment with local models.
 OLLAMA_BASE_URL='http://localhost:11434'

diff --git a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
index 85fa73f..d78d1a0 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
+++ b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
@@ -7,4 +7,5 @@ llama-index-embeddings-ollama==0.3.0
 llama-index-vector-stores-qdrant==0.3.0
 qdrant-client==1.11.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+ragas==0.1.20
\ No newline at end of file

diff --git a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
index d2b6df9..dcc3e2c 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
+++ b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
@@ -10,6 +10,7 @@ from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
 from dotenv import load_dotenv, find_dotenv
+from test_set_generator import TestSetGenerator
 from typing import Union
 import qdrant_client
 import logging
 
@@ -37,6 +38,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self.query_engine_tools = []
         self.show_progress = show_progress
 
+        self.test_set_generator = TestSetGenerator()
+
         # use your preferred vector embeddings model
         logger.info("initializing the OllamaEmbedding")
         embed_model = OllamaEmbedding(model_name=os.environ['OLLAMA_EMBED_MODEL'],
@@ -63,6 +66,10 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self._create_index()
 
     def _create_index(self):
+
+        # create an evaluation test set
+        self.test_set_generator.generate_test_set(input_dir=self.input_dir)  # leaving defaults as is.
+
         if self.client.collection_exists(collection_name=os.environ['COLLECTION_NAME']):
             try:
                 self._index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store)

diff --git a/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py b/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py
new file mode 100644
index 0000000..d3a7f5a
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py
@@ -0,0 +1,48 @@
+from llama_index.core import SimpleDirectoryReader
+from ragas.testset.generator import TestsetGenerator
+from ragas.testset.evolutions import simple, reasoning, multi_context
+from llama_index.llms.ollama import Ollama
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.ollama import OllamaEmbedding
+from llama_index.embeddings.openai import OpenAIEmbedding
+from dotenv import load_dotenv, find_dotenv
+import logging
+import os
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class TestSetGenerator:
+    def __init__(self):
+        # generator with ollama models
+        # generator_llm = Ollama(model=os.environ.get('OLLAMA_LLM_MODEL'), request_timeout=300)
+        # critic_llm = Ollama(model=os.environ.get('OLLAMA_LLM_MODEL'), request_timeout=300)
+        # embeddings = OllamaEmbedding(model_name=os.environ.get('OLLAMA_EMBED_MODEL'), request_timeout=300)
+
+        # generator with openai models
+        generator_llm = OpenAI(model=os.environ.get('OPENAI_MODEL'), timeout=300)
+        critic_llm = OpenAI(model=os.environ.get('OPENAI_MODEL'), timeout=300)
+        embeddings = OpenAIEmbedding(model=os.environ.get('OPENAI_EMBED_MODEL'), timeout=300)
+
+        self.generator = TestsetGenerator.from_llama_index(
+            generator_llm=generator_llm,
+            critic_llm=critic_llm,
+            embeddings=embeddings,
+        )
+
+    def generate_test_set(self, input_dir, test_size: int = 5, simple_dist: float = 0.5, reasoning_dist: float = 0.25,
+                          multi_context_dist: float = 0.25, show_progress: bool = True):
+        logger.info('loading docs..')
+        _docs = SimpleDirectoryReader(input_dir=input_dir).load_data(show_progress=show_progress)
+        # generate test set
+        logger.info('test set generation started..')
+        test_set = self.generator.generate_with_llamaindex_docs(
+            documents=_docs,
+            test_size=test_size,
+            distributions={simple: simple_dist, reasoning: reasoning_dist, multi_context: multi_context_dist},
+        )
+        logger.info('test set generation ended..filename: testset.csv')
+        df = test_set.to_pandas()
+        df.to_csv("testset.csv", index=False)

From 65fde87710a7b3f410e27d5da63e933e702b9f8c Mon Sep 17 00:00:00 2001
From: pavanmantha
Date: Tue, 1 Oct 2024 23:00:20 +0530
Subject: [PATCH 2/2] -integrated deepeval evaluations

---
 .../templates/llamaindex/simple_rag/.env | 2 +
 .../llamaindex/simple_rag/rag_evaluator.py | 34 +++++++++++++
 .../llamaindex/simple_rag/requirements.txt | 2 +-
 .../llamaindex/simple_rag/simple_rag.py | 10 ++--
 .../simple_rag/test_set_generator.py | 48 -------------------
 .../rag_evaluator.py | 34 +++++++++++++
 .../requirements.txt | 3 +-
 .../simple_rag.py | 7 ++-
 8 files changed, 84 insertions(+), 56 deletions(-)
 create mode 100644 bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
 delete mode 100644 bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py
 create mode 100644 bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py

diff --git a/bootstraprag/templates/llamaindex/simple_rag/.env b/bootstraprag/templates/llamaindex/simple_rag/.env
index 8bb2d5d..34130ba 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/.env
+++ b/bootstraprag/templates/llamaindex/simple_rag/.env
@@ -23,3 +23,5 @@ NOTSET = 0
 
 LIT_SERVER_PORT=8000
 LIT_SERVER_WORKERS_PER_DEVICE=4
+
+IS_EVALUATION_NEEDED=true

diff --git a/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py b/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/simple_rag/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")

diff --git a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
index d78d1a0..d71d85c 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
+++ b/bootstraprag/templates/llamaindex/simple_rag/requirements.txt
@@ -8,4 +8,4 @@ llama-index-vector-stores-qdrant==0.3.0
 qdrant-client==1.11.1
 pydantic==2.9.0
 litserve==0.2.2
-ragas==0.1.20
\ No newline at end of file
+deepeval==1.3.2
\ No newline at end of file

diff --git a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
index dcc3e2c..0c76107 100644
--- a/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
+++ b/bootstraprag/templates/llamaindex/simple_rag/simple_rag.py
@@ -10,7 +10,7 @@ from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
 from dotenv import load_dotenv, find_dotenv
-from test_set_generator import TestSetGenerator
+from rag_evaluator import RAGEvaluator
 from typing import Union
 import qdrant_client
 import logging
@@ -38,7 +38,7 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self.query_engine_tools = []
         self.show_progress = show_progress
 
-        self.test_set_generator = TestSetGenerator()
+        self.rag_evaluator = RAGEvaluator()
 
         # use your preferred vector embeddings model
         logger.info("initializing the OllamaEmbedding")
@@ -67,9 +67,6 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
 
     def _create_index(self):
 
-        # create an evaluation test set
-        self.test_set_generator.generate_test_set(input_dir=self.input_dir)  # leaving defaults as is.
-
         if self.client.collection_exists(collection_name=os.environ['COLLECTION_NAME']):
             try:
                 self._index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store)
@@ -94,4 +91,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
         query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
         logger.info("LLM is thinking...")
         response = query_engine.query(str_or_query_bundle=user_query)
+        logger.info(f'response: {response}')
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
         return response

diff --git a/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py b/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py
deleted file mode 100644
index d3a7f5a..0000000
--- a/bootstraprag/templates/llamaindex/simple_rag/test_set_generator.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from llama_index.core import SimpleDirectoryReader
-from ragas.testset.generator import TestsetGenerator
-from ragas.testset.evolutions import simple, reasoning, multi_context
-from llama_index.llms.ollama import Ollama
-from llama_index.llms.openai import OpenAI
-from llama_index.embeddings.ollama import OllamaEmbedding
-from llama_index.embeddings.openai import OpenAIEmbedding
-from dotenv import load_dotenv, find_dotenv
-import logging
-import os
-
-_ = load_dotenv(find_dotenv())
-logging.basicConfig(level=int(os.environ['INFO']))
-logger = logging.getLogger(__name__)
-
-
-class TestSetGenerator:
-    def __init__(self):
-        # generator with ollama models
-        # generator_llm = Ollama(model=os.environ.get('OLLAMA_LLM_MODEL'), request_timeout=300)
-        # critic_llm = Ollama(model=os.environ.get('OLLAMA_LLM_MODEL'), request_timeout=300)
-        # embeddings = OllamaEmbedding(model_name=os.environ.get('OLLAMA_EMBED_MODEL'), request_timeout=300)
-
-        # generator with openai models
-        generator_llm = OpenAI(model=os.environ.get('OPENAI_MODEL'), timeout=300)
-        critic_llm = OpenAI(model=os.environ.get('OPENAI_MODEL'), timeout=300)
-        embeddings = OpenAIEmbedding(model=os.environ.get('OPENAI_EMBED_MODEL'), timeout=300)
-
-        self.generator = TestsetGenerator.from_llama_index(
-            generator_llm=generator_llm,
-            critic_llm=critic_llm,
-            embeddings=embeddings,
-        )
-
-    def generate_test_set(self, input_dir, test_size: int = 5, simple_dist: float = 0.5, reasoning_dist: float = 0.25,
-                          multi_context_dist: float = 0.25, show_progress: bool = True):
-        logger.info('loading docs..')
-        _docs = SimpleDirectoryReader(input_dir=input_dir).load_data(show_progress=show_progress)
-        # generate test set
-        logger.info('test set generation started..')
-        test_set = self.generator.generate_with_llamaindex_docs(
-            documents=_docs,
-            test_size=test_size,
-            distributions={simple: simple_dist, reasoning: reasoning_dist, multi_context: multi_context_dist},
-        )
-        logger.info('test set generation ended..filename: testset.csv')
-        df = test_set.to_pandas()
-        df.to_csv("testset.csv", index=False)

diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py b/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py
new file mode 100644
index 0000000..c96d77f
--- /dev/null
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/rag_evaluator.py
@@ -0,0 +1,34 @@
+from deepeval.integrations.llama_index import (
+    DeepEvalFaithfulnessEvaluator,
+    DeepEvalAnswerRelevancyEvaluator,
+    DeepEvalContextualRelevancyEvaluator
+)
+from dotenv import load_dotenv, find_dotenv
+from typing import Any
+import os
+import logging
+
+_ = load_dotenv(find_dotenv())
+logging.basicConfig(level=int(os.environ['INFO']))
+logger = logging.getLogger(__name__)
+
+
+class RAGEvaluator:
+    def __init__(self):
+        self.faithfulness_evaluator = DeepEvalFaithfulnessEvaluator()
+        self.answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator()
+        self.context_relevancy_evaluator = DeepEvalContextualRelevancyEvaluator()
+
+    def evaluate(self, user_query: str, response_obj: Any):
+        logger.info(f"calling evaluation, user_query: {user_query}, response_obj: {response_obj}")
+        retrieval_context = [node.get_content() for node in response_obj.source_nodes]
+        actual_output = response_obj.response
+        faithfulness_evaluation_response = self.faithfulness_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                                contexts=retrieval_context)
+        answer_relevancy_response = self.answer_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                             contexts=retrieval_context)
+        context_relevancy_response = self.context_relevancy_evaluator.evaluate(query=user_query, response=actual_output,
+                                                                               contexts=retrieval_context)
+        logger.info(f"faithfulness_evaluation_response: {faithfulness_evaluation_response.score}")
+        logger.info(f"answer_relevancy_response: {answer_relevancy_response.score}")
+        logger.info(f"context_relevancy_response: {context_relevancy_response.score}")

diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt b/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
index 93cf554..0525aa2 100644
--- a/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/requirements.txt
@@ -9,4 +9,5 @@ qdrant-client==1.11.1
 arize-phoenix==4.33.1
 llama-index-callbacks-arize-phoenix==0.2.1
 pydantic==2.9.0
-litserve==0.2.2
\ No newline at end of file
+litserve==0.2.2
+deepeval==1.3.2
\ No newline at end of file

diff --git a/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py b/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
index 78dfe4c..066c11e 100644
--- a/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
+++ b/bootstraprag/templates/llamaindex/simple_rag_with_observability/simple_rag.py
@@ -4,12 +4,12 @@
     StorageContext,
     Settings
 )
-from llama_index.core.tools import QueryEngineTool, ToolMetadata
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.vector_stores.qdrant import QdrantVectorStore
 from llama_index.core.agent import ReActAgent
 from llama_index.llms.ollama import Ollama
 from llama_index.core.base.response.schema import Response, StreamingResponse, AsyncStreamingResponse, PydanticResponse
+from rag_evaluator import RAGEvaluator
 from dotenv import load_dotenv, find_dotenv
 from typing import Union
 import qdrant_client
@@ -44,6 +44,8 @@ def __init__(self, input_dir: str, similarity_top_k: int = 3, chunk_size: int =
         self.query_engine_tools = []
         self.show_progress = show_progress
 
+        self.rag_evaluator = RAGEvaluator()
+
         # use your preferred vector embeddings model
         logger.info("initializing the OllamaEmbedding")
         embed_model = OllamaEmbedding(model_name=os.environ['OLLAMA_EMBED_MODEL'],
@@ -93,4 +95,7 @@ def do_rag(self, user_query: str) -> RESPONSE_TYPE:
         query_engine = self._index.as_query_engine(similarity_top_k=self.similarity_top_k)
         logger.info("LLM is thinking...")
         response = query_engine.query(str_or_query_bundle=user_query)
+        logger.info(f'response: {response}')
+        if os.environ.get('IS_EVALUATION_NEEDED') == 'true':
+            self.rag_evaluator.evaluate(user_query=user_query, response_obj=response)
         return response
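
Usage note (reviewer sketch): after PATCH 2/2, evaluation is driven entirely by do_rag(): when IS_EVALUATION_NEEDED is the string 'true', each query is also scored for faithfulness, answer relevancy, and contextual relevancy by the DeepEval evaluators, and the scores are logged. The snippet below is a minimal sketch of how a caller would exercise that path; the class name SimpleRAG and the 'data' input directory are assumptions, since the patch shows neither the class declaration nor a caller.

    # usage_sketch.py - hypothetical driver, not part of this patch
    import os
    from dotenv import load_dotenv, find_dotenv
    from simple_rag import SimpleRAG  # hypothetical class name; the declaration is outside this diff

    _ = load_dotenv(find_dotenv())  # pulls DB_URL, COLLECTION_NAME, IS_EVALUATION_NEEDED, etc. from .env
    os.environ['IS_EVALUATION_NEEDED'] = 'true'  # do_rag compares the raw string, so 'true', not 'True'

    rag = SimpleRAG(input_dir='data')  # 'data' is an assumed corpus directory
    answer = rag.do_rag(user_query='What does the corpus say about indexing?')
    # with the flag on, do_rag logs the response plus the three DeepEval scores
    # (faithfulness, answer relevancy, contextual relevancy) before returning
    print(answer)

Since the evaluators run synchronously inside do_rag and each one calls an LLM judge, enabling the flag adds several extra model round trips per query; leaving it off in .env and switching it on only for spot checks is the cheaper default.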