In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import sys
sys.tracebacklimit = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
questions = [
    "What is RAG?",
    "In the framework for implementing generative AI services using the Retrieval-Augmented Generation (RAG) model, what specific open-source products are chosen for vectorized databases?",
    "What does CRUD stand for when it comes to RAG?",
    "What evaluation metrics can be used to evaluate RAG systems?",
    "What are the four fundamental abilities required for Retrieval-Augmented Generation (RAG) according to Chen et al., and how are they evaluated in the Retrieval-Augmented Generation Benchmark (RGB)?",
    "What method does the Thulke et al. propose to address the computational complexity issue in the knowledge selection task, and how does it achieve a significant reduction in computation time?",
    "How does the proposed system by Matei Bucur address the limitations related to the availability of relevant external documents for form filling in the broader context?",
    "What embedding methods can be used for RAG systems?",
    "According to the experimental results by Wu et al., how much did the second strategy, based on the absence of detected hallucination spans, reduce the hallucination rate for the GPT-3.5-Turbo and GPT-4 models in comparison to random selection?",
    "What are some of the best retrieval strategies in RAG systems?"
]

ground_truths = [
    ["RAG stands for Retrieval Augmented Generation. It is a technique developed to address the challenge of requiring external data for generating more cohesive answers. The approach involves fetching and incorporating information into the prompt, allowing the model to generate responses on subjects and data not encountered during training. This aims to reduce the occurrence of hallucinations in the model's outputs."],
    ["Chroma DB and FAISS."],
    ["In 'CREATE', the system improves the input text by adding relevant information from external sources, making creative outputs such as poetry, stories, or code. In 'READ', the system uses external knowledge retrieval to answer questions, solving problems in question-answering, dialogue, and reasoning, and increasing understanding of the input text. In 'UPDATE', the system fixes errors in the input text using retrieved content, correcting spelling, grammar or factual errors to make the text better. In 'DELETE', the system simplifies the input by improving retrieval results, removing unnecessary details, and doing tasks like text summarization or simplification"],
    ["Accuracy, Rejection rate, Error detection rate, Baseline model Reflexion, Precision of reasoning, Multi-hop reasoning accuracy, Mean Recriprocal Rank (MRR@k), Recall@k, Relative Maximum Score, Mean Average Precision at K (MAP@K), Mean Reciprocal Rank at K (MRR@K),  Hit Rate at K (Hit@K), Exact Match (EM), F1 score, Precision,  Recall, perplexity (PPL), GLEU, BLEU, ROUGE, METEOR,  BERTScore, EM score (Exact Match score), RAGQuestEval, Faithfulness, Recall@1 (R@1), BLEU-1, METEOR, ROUGE-L"],
    ["The four fundamental abilities required for RAG, as mentioned in the paper, are Noise Robustness, Negative Rejection, Information Integration, and Counterfactual Robustness. These abilities are evaluated in RGB through four separate testbeds, each focusing on one of these abilities. The testbeds are constructed based on the common challenges in RAG and include instances that assess the performance of large language models in terms of noise robustness, negative rejection, information integration, and counterfactual robustness."],
    ["They propose two methods to address the computational complexity issue in the knowledge selection task. The first method involves hierarchical selection, which splits the task into stages, reducing the computational cost significantly. The second method employs Dense Knowledge Retrieval, utilizing siamese sequence embedding networks for efficient document retrieval. This method achieves a more than 100x reduction in computation time compared to the baseline, albeit with a 5-6% degradation in R@1 compared to the hierarchical selection method."],
    ["The proposed system addresses the limitations related to the availability of relevant external documents for form filling in the broader context by encouraging users to upload all documents relevant to the specific form-filling task, such as FAQs, policies, guidelines, and rules. These documents are used to create a search index, and the retrieval mechanism leverages this index to generate form completions. Additionally, the system allows users to provide a brief description of the intent and purpose of the form, adding extra short-form documents to enhance the understanding of the task. This approach aims to ensure that the system has access to the necessary information, making it more adaptable to different form-filling scenarios."],
    ["Custom ADA-002, BM25, Customization techniques involving adaptations like Multiple Negative Ranking Loss and Mean Squared Error (MSE) Loss, text-embedding-ada-002, text-search-ada-query-001, llm-embedder, bge-large-en-v1.5, jina-embeddings-v2-base-en, e5-base-v2,  instructor-large, voyage-02."],
    ["The second strategy, based on the absence of detected hallucination spans, achieved a reduction in hallucination rates of 52.9% for GPT-3.5-Turbo and 41% for GPT-4 models, compared to random selection"],
    ["Hybrid search, reranker, multi-time retrieval approach, vector search, BM25, hierarchical selection, prompt engineering."]
]

### General answer by LLMs

In [16]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    return average_score

In [29]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [30]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [31]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [32]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [33]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.91it/s]


{'faithfulness': 0.9444, 'answer_relevancy': 0.3760, 'answer_similarity': 0.8413, 'answer_correctness': 0.3895}


In [34]:
dict_llm_results = dictionary(llm_results)

The average score is: 0.6377986113727302


In [35]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:11<00:00,  3.45it/s]


{'faithfulness': 0.9594, 'answer_relevancy': 0.8764, 'answer_similarity': 0.9042, 'answer_correctness': 0.5814}


In [36]:
dict_llm_results_gpt4 = dictionary(llm_results_gpt4)

The average score is: 0.8303702964930484


# RAG

### Naive RAG

In [7]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [8]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [39]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [40]:
db_naive = Chroma(persist_directory = "../papers/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [9]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [42]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)



In [43]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:07,  6.52it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

{'faithfulness': 0.8179, 'answer_relevancy': 0.5780, 'context_precision': 0.5228, 'context_recall': 0.5726, 'answer_similarity': 0.7423, 'answer_correctness': 0.6498}


In [44]:
dict_result_naive_rag = dictionary(result_naive_rag)

The average score is: 0.6472394986419802


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [45]:
db_basic = Chroma(persist_directory = "../papers/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [46]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [47]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.64it/s]


{'faithfulness': 0.8743, 'answer_relevancy': 0.8323, 'context_precision': 0.5972, 'context_recall': 0.7875, 'answer_similarity': 0.8838, 'answer_correctness': 0.4480}


In [48]:
dict_result_recursive = dictionary(result_recursive)

The average score is: 0.7371898552172951


### chunk sizes

In [10]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

chunk = 1000

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb/recursive_1000")
db_1000.persist()

In [11]:
db_1000 = Chroma(persist_directory = "../papers/vectordb/recursive_1000", embedding_function=embeddings_client)

In [51]:
retriever_1000 = db_1000.as_retriever()
result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.39it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9429, 'answer_relevancy': 0.8294, 'context_precision': 0.5889, 'context_recall': 0.8208, 'answer_similarity': 0.8868, 'answer_correctness': 0.4487}


In [52]:
dict_result_1000 = dictionary(result_1000)

The average score is: 0.752912607504265


chunk = 500

In [33]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../papers/vectordb/recursive_500")
db_500.persist()

107


In [53]:
db_500 = Chroma(persist_directory = "../papers/vectordb/recursive_500", embedding_function=embeddings_client)

In [54]:
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-333' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packa

CHUNK SIZE 500
{'faithfulness': 0.6822, 'answer_relevancy': 0.7488, 'context_precision': 0.5967, 'context_recall': 0.7615, 'answer_similarity': 0.8450, 'answer_correctness': 0.6514}


In [55]:
dict_result_500 = dictionary(result_500)

The average score is: 0.7142606903567085


chunk = 2000

In [35]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../papers/vectordb/recursive_2000")


37


In [56]:
db_2000 = Chroma(persist_directory = "../papers/vectordb/recursive_2000", embedding_function=embeddings_client)

In [57]:
retriever_2000 = db_2000.as_retriever()
result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.50it/s]


CHUNK SIZE 2000
{'faithfulness': 0.7667, 'answer_relevancy': 0.8230, 'context_precision': 0.5972, 'context_recall': 0.8208, 'answer_similarity': 0.8832, 'answer_correctness': 0.5073}


In [58]:
dict_result_2000 = dictionary(result_2000)

The average score is: 0.7330396049761241


chunk = 3000

In [38]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../papers/vectordb/recursive_3000")

32


In [59]:
db_3000 = Chroma(persist_directory = "../papers/vectordb/recursive_3000", embedding_function=embeddings_client)

In [60]:
retriever_3000 = db_3000.as_retriever()
result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
print("CHUNK SIZE 3000")
print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.05it/s]


CHUNK SIZE 3000
{'faithfulness': 0.8595, 'answer_relevancy': 0.8386, 'context_precision': 0.6333, 'context_recall': 0.8208, 'answer_similarity': 0.8843, 'answer_correctness': 0.4686}


In [61]:
dict_result_3000 = dictionary(result_3000)

The average score is: 0.7508713801108652


### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [63]:
retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
result_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 1000, K=3")
print(result_3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:06,  7.10it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

CHUNK SIZE 1000, K=3
{'faithfulness': 0.5797, 'answer_relevancy': 0.7975, 'context_precision': 0.5393, 'context_recall': 0.6875, 'answer_similarity': 0.7113, 'answer_correctness': 0.7793}


In [64]:
dict_result_3 = dictionary(result_3)

The average score is: 0.6824532277442156


In [65]:
retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
result_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.14it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8000, 'answer_relevancy': 0.8256, 'context_precision': 0.5815, 'context_recall': 0.7800, 'answer_similarity': 0.8859, 'answer_correctness': 0.4240}


In [66]:
dict_result_5 = dictionary(result_5)

The average score is: 0.7161788734282134


In [67]:
retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})
result_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.92it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8256, 'answer_relevancy': 0.8338, 'context_precision': 0.5782, 'context_recall': 0.8800, 'answer_similarity': 0.8907, 'answer_correctness': 0.5280}


In [68]:
dict_result_6 = dictionary(result_6)

The average score is: 0.7560586606303001


In [69]:
retriever_7= db_1000.as_retriever(search_kwargs={"k": 7})
result_7 = run_and_evaluate(retriever_7, prompt, llm)
print("CHUNK SIZE 1000, K=7")
print(result_7)
dict_result_7 = dictionary(result_7)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.49it/s]


CHUNK SIZE 1000, K=7
{'faithfulness': 0.9429, 'answer_relevancy': 0.8166, 'context_precision': 0.5914, 'context_recall': 0.7633, 'answer_similarity': 0.8919, 'answer_correctness': 0.5029}
The average score is: 0.751507469027492


### look for different retrievers

6 chunks was the best score

#### parent document retriever

In [16]:
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents", persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_kwargs={"k": 6}
)
parent_document_retriever.add_documents(documents)
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent = dictionary(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.97it/s]


{'faithfulness': 0.8263, 'answer_relevancy': 0.7337, 'context_precision': 0.7000, 'context_recall': 0.8569, 'answer_similarity': 0.8923, 'answer_correctness': 0.4777}
The average score is: 0.7478190875052072


In [17]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
    search_kwargs={"k": 6}
)
parent_document_retriever_small.add_documents(documents)
result_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
dict_result_parent_small = dictionary(result_parent_small)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

{'faithfulness': 0.7989, 'answer_relevancy': 0.4232, 'context_precision': 0.5833, 'context_recall': 0.5544, 'answer_similarity': 0.5782, 'answer_correctness': 0.5696}
The average score is: 0.5845850482989023


In [18]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_large", persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)

store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
    search_kwargs={"k": 6}
)
parent_document_retriever_large.add_documents(documents)
result_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
dict_result_parent_large = dictionary(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.88it/s]


{'faithfulness': 0.7946, 'answer_relevancy': 0.6375, 'context_precision': 0.5542, 'context_recall': 0.7100, 'answer_similarity': 0.8907, 'answer_correctness': 0.4764}
The average score is: 0.6772374702311944


#### Maximum marginal relevance retrieval

In [19]:
retriever_mmr = db_1000.as_retriever(search_type="mmr",search_kwargs={"k": 6})
result_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
average_score_mmr = dictionary(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.61it/s]


Marginal relevance
{'faithfulness': 0.8167, 'answer_relevancy': 0.8113, 'context_precision': 0.6381, 'context_recall': 0.9667, 'answer_similarity': 0.9001, 'answer_correctness': 0.5372}
The average score is: 0.7783108799399754


#### BM25

In [13]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)

In [21]:
retriever_bm25 = BM25Retriever.from_documents(chunks_1000)
result_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
average_score_bm25 = dictionary(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

BM25
{'faithfulness': 0.8168, 'answer_relevancy': 0.4753, 'context_precision': 0.8400, 'context_recall': 0.4261, 'answer_similarity': 0.5471, 'answer_correctness': 0.6587}
The average score is: 0.6273412574850488


#### Ensambler - Hybrid

In [22]:
retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})

In [23]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.75, 0.25])
result_ensemble1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
average_score_ensambler1 = dictionary(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.24it/s]


Ensambler 75/25
{'faithfulness': 0.9714, 'answer_relevancy': 0.7870, 'context_precision': 0.4741, 'context_recall': 0.7500, 'answer_similarity': 0.8999, 'answer_correctness': 0.4921}
The average score is: 0.7290911608987622


In [24]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.5, 0.5])
result_ensemble2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
avg_result_ensemble2 = dictionary(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.79it/s]


Ensambler 50/50
{'faithfulness': 0.9500, 'answer_relevancy': 0.7810, 'context_precision': 0.5291, 'context_recall': 0.8000, 'answer_similarity': 0.8974, 'answer_correctness': 0.4897}
The average score is: 0.7411992190195104


In [25]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.25,0.75])
result_ensemble3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
avg_result_ensemble3 = dictionary(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

In [15]:
ensemble_retriever_4 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.25,0.75])
result_ensemble4 = run_and_evaluate(ensemble_retriever_4, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble4)
avg_result_ensemble4 = dictionary(result_ensemble4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

Ensambler 25/75
{'faithfulness': 0.9220, 'answer_relevancy': 0.5278, 'context_precision': 0.8992, 'context_recall': 0.7194, 'answer_similarity': 0.6978, 'answer_correctness': 0.8041}


NameError: name 'dictionary' is not defined

In [17]:
avg_result_ensemble4 = dictionary(result_ensemble4)

The average score is: 0.761723666299702


In [18]:
ensemble_retriever_5 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.5,0.5])
result_ensemble5 = run_and_evaluate(ensemble_retriever_5, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble5)
avg_result_ensemble5 = dictionary(result_ensemble5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.84it/s]


Ensambler 50/50
{'faithfulness': 0.9167, 'answer_relevancy': 0.7830, 'context_precision': 0.5544, 'context_recall': 0.8889, 'answer_similarity': 0.8966, 'answer_correctness': 0.5320}
The average score is: 0.7619267999961684


In [19]:
ensemble_retriever_6 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.75,0.25])
result_ensemble6 = run_and_evaluate(ensemble_retriever_6, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble6)
avg_result_ensemble6 = dictionary(result_ensemble6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.93it/s]


Ensambler 75/25
{'faithfulness': 0.8921, 'answer_relevancy': 0.7009, 'context_precision': 0.5021, 'context_recall': 0.8000, 'answer_similarity': 0.9053, 'answer_correctness': 0.5154}
The average score is: 0.7193036932050777


#### Multi-stage - reranker

In [20]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [22]:
retriever_context = retriever_mmr
compressor = CohereRerank(top_n=6)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  60%|██████    | 36/60 [00:06<00:02,  9.59it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.51it/s]


Reranker
{'faithfulness': 0.7556, 'answer_relevancy': 0.6267, 'context_precision': 0.6985, 'context_recall': 0.9000, 'answer_similarity': 0.9001, 'answer_correctness': 0.5243}
The average score is: 0.7341855512490597


#### creating context by remaking the query

In [24]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [25]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_mmr.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
print(result_search_query)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.66it/s]


{'faithfulness': 0.9055, 'answer_relevancy': 0.4621, 'context_precision': 0.6764, 'context_recall': 0.7257, 'answer_similarity': 0.8721, 'answer_correctness': 0.4000}


In [26]:
avg_result_search_query = dictionary(result_search_query)

The average score is: 0.6736276695602351


### change model to GPT-4

In [27]:
answers_final_gpt4 = []
contexts_final_gpt4 = []
result_mmr_gpt4 = run_and_evaluate(retriever_mmr, prompt, llm_gpt4)
print("Marginal relevance")
print(result_mmr_gpt4)
average_score_result_mmr_gpt4 = dictionary(result_mmr_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.08it/s]


Marginal relevance
{'faithfulness': 0.9042, 'answer_relevancy': 0.7370, 'context_precision': 0.6381, 'context_recall': 0.9500, 'answer_similarity': 0.8971, 'answer_correctness': 0.5400}
The average score is: 0.777724376964276
