In [2]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import sys
sys.tracebacklimit = 0

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [4]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [5]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [7]:
questions = [
    "Write a news report that starts as follows: The Preprint titled 'The Chronicles of RAG: The Retriever, the Chunk and the Generator' delves into cutting-edge strategies for implementing the Retrieval Augmented Generation (RAG) system tailored for the Brazilian Portuguese language. Written by a team of researchers from Brazil, the paper addresses the challenges and advancements in optimizing RAG systems for improved text generation in response to user queries.",
    "Write a short report that starts as follows: MultiHop-RAG is a benchmarking dataset designed for evaluating Retrieval-Augmented Generation (RAG) systems for multi-hop queries.",
    "Write a short report that starts as follows: In recent years, the integration of Large Language Models (LLMs) has significantly advanced generative AI services, notably with the Retrieval-Augmented Generation (RAG) model. This report initiates a comprehensive exploration, evaluating RAG's impact through the Retrieval-Augmented Generation Benchmark (RGB) and its application in the Dialog System Technology Challenge 9 (DSTC9). Additionally, we delve into innovative approaches like GENREAD, which enhances LLMs' capabilities for tasks such as form filling and multi-hop query resolution, aiming to provide actionable insights for leveraging these technologies effectively.",
    "Finish the following idea: The CRUD framework offers a structured approach to assessing the capabilities and limitations of Retrieval-Augmented Generation (RAG) systems in different text generation scenarios.",
    "Write a short report that starts as follows: HybridRAG is a novel approach that efficiently combines a cloud-based Large Language Model (LLM) with a smaller client-side language model through retrieval augmented memory to deliver real-time composition assistance. It allows the client model to generate effective responses by leveraging the LLM's capabilities and contextual information.",
    "Finish writing the following short report: MetaRAG is introduced as a framework that combines the retrieval-augmented generation process with metacognitive abilities, inspired by human cognitive processes. The integration of Metacognition in MetaRAG allows the model to monitor, evaluate, and plan its response strategies, enhancing introspective reasoning and self-awareness in the cognitive process. This approach enables the model to identify shortcomings in the reasoning process and aims to enhance the accuracy of answer derivation.",
    "Write a report that starts as follows: RAGTruth is a hallucination corpus specifically designed for analyzing word-level hallucinations in diverse tasks within the Retrieval-Augmented Generation (RAG) context for Language Models. This corpus encompasses nearly 18,000 naturally generated responses from various models using RAG. Each response has undergone meticulous manual annotation at both individual cases and word levels, evaluating the intensity of hallucinations.",
    "Write a short news report that starts as follows: The PaperQA system revolutionizes scientific research with its cutting-edge capabilities in answering complex scientific questions.",
    "Finish writing the following report: Various evaluation metrics play a crucial role in assessing the performance of Retrieval-Augmented Generation (RAG) systems across different text generation scenarios. Methods that can be used are reference-free, evaluation for retrievers, evaluation for embedding models, RagQuestEval Metrics, evaluation based on chunk size, evaluation of LLMs as well as end-to-end evaluation metrics.",
    "Finish writing the following report: Retrievers are a crucial component in the field of natural language processing, especially in the context of information retrieval and question answering systems. They play a key role in enabling models to access external knowledge sources efficiently and effectively. Retrievers are often utilized to retrieve and present relevant documents or passages from large corpora, such as Wikipedia, to aid in answering queries or generating responses. The integration of retrievers in language models, such as in the Framework Retrieval-Augmented Language Model (REALM), allows for the retrieval of knowledge from external sources during pre-training, fine-tuning, and inference stages."
]

ground_truths = [
    ["The Preprint titled 'The Chronicles of RAG: The Retriever, the Chunk and the Generator' delves into cutting-edge strategies for implementing the Retrieval Augmented Generation (RAG) system tailored for the Brazilian Portuguese language. Written by a team of researchers from Brazil, the paper addresses the challenges and advancements in optimizing RAG systems for improved text generation in response to user queries. In a meticulous review of RAG techniques, the researchers highlight the significance of effective integration of retrieval models, data representation learning, computational efficiency optimization, and the importance of quality text generation during RAG implementation. Through a series of experiments focused on the Harry Potter book dataset, different retrieval techniques were tested, including sparse and dense retrievers, sentence chunking strategies, and the impact of document positioning within prompts on overall content relevance. The results showcased a significant enhancement in the RAG system's quality, with a notable improvement in Mean Reciprocal Rank (MRR@10) and recall metrics when using hybrid search techniques that combine both sparse and dense retrieval methods. The hybrid approach demonstrated the potential to further enhance the accuracy and relevance of responses generated by RAG systems in Brazilian Portuguese-related tasks. Moreover, the study introduced a new metric called the relative maximum score, which allows for a direct comparison of the performance of different LLMs within the RAG system. The research aims to provide valuable insights and practical guidelines for developers looking to implement RAG in Brazilian Portuguese applications, with a focus on simplifying the pipeline for inference and experiments. Overall, the paper sheds light on the evolving landscape of RAG systems and offers key recommendations for optimizing performance and quality in Brazilian Portuguese text generation applications. This research opens new avenues for the application of advanced retrieval techniques in enhancing the capabilities of language models for diverse language tasks."],
    ["MultiHop-RAG is a benchmarking dataset designed for evaluating Retrieval-Augmented Generation (RAG) systems for multi-hop queries. This dataset includes a knowledge base, a collection of multi-hop queries, their ground-truth answers, and associated supporting evidence. Multi-hop queries are those that require retrieving and reasoning over multiple pieces of evidence from various sources to formulate a response. MultiHop-RAG addresses the limitations of existing RAG systems by focusing on complex multi-hop queries and providing a resource for developing effective RAG frameworks, ultimately enhancing the adoption of Large Language Models (LLMs) in practical applications. The dataset encompasses various query types, such as inference, comparison, temporal, and null queries, each testing the retrieval and reasoning capabilities of the LLMs in handling complex information processing tasks. Additionally, MultiHop-RAG facilitates benchmarking of RAG systems by offering a diverse set of queries and evidential support challenging the retrieval and reasoning abilities of various state-of-the-art LLMs. The dataset represents a valuable resource for the NLP community to advance RAG technologies and harness the potential of generative AI in real-world applications."],
    ["In recent years, the integration of Large Language Models (LLMs) has significantly advanced generative AI services, notably with the Retrieval-Augmented Generation (RAG) model. This report initiates a comprehensive exploration, evaluating RAG's impact through the Retrieval-Augmented Generation Benchmark (RGB) and its application in the Dialog System Technology Challenge 9 (DSTC9). Additionally, we delve into innovative approaches like GENREAD, which enhances LLMs' capabilities for tasks such as form filling and multi-hop query resolution, aiming to provide actionable insights for leveraging these technologies effectively. Generative AI Services: The study proposes a method for implementing generative AI services using Large Language Models (LLMs). Addressing Information Scarcity: The research addresses the challenge of information scarcity by utilizing LLM capabilities and proposes remedies for inadequate data. Fine-tuning and Document Integration: Strategies like fine-tuning techniques and direct document integration are explored to mitigate data insufficiency. Retrieval-Augmented Generation (RAG): Introduces the RAG model designed to enhance information storage, retrieval processes, and content generation. Information Storage and Retrieval Methodology: The study outlines key phases of the information storage and retrieval methodology supported by the RAG model. Evaluation of RAG Impact on LLMs: Analyzes the impact of Retrieval-Augmented Generation on different LLMs, focusing on abilities like noise robustness, negative rejection, information integration, and counterfactual robustness. Retrieval-Augmented Generation Benchmark (RGB): A corpus for evaluating RAG in English and Chinese, divided into testbeds for various abilities required for RAG. Dialog System Technology Challenge (DSTC 9): Discusses work on the DSTC 9 track focused on task-oriented conversational modeling with unstructured knowledge access. Automated Form Filling: Proposes adapting LLMs for automated form completion tasks by creating a knowledge base, augmenting LLMs with retrieval-augmented generation, and using prompt engineering techniques. Generate-then-Read (GENREAD): Presents a method for solving knowledge-intensive tasks by generating contextual documents using LLMs before reading them for answers. Hybrid Retrieval-Augmented Generation (HybridRAG): Introduces a framework combining cloud-based LLMs with client-side models through retrieval-augmented memory for real-time task efficiency. MetaRAG: Integrates retrieval-augmented generation with metacognition to enhance model reasoning abilities through self-reflection and critical evaluation. MultiHop-RAG Dataset: Develops a dataset focusing on multi-hop queries to address inadequacies in existing RAG systems, aiming to facilitate the development of effective RAG systems for LLMs."],
    ["The CRUD framework offers a structured approach to assessing the capabilities and limitations of Retrieval-Augmented Generation (RAG) systems in different text generation scenarios. By categorizing tasks into Create, Read, Update, and Delete types, CRUD provides a comprehensive evaluation method for RAG systems. This framework allows for targeted analysis of how external knowledge sources enhance text generation in various contexts, enabling researchers and developers to optimize RAG systems more effectively."],
    ["HybridRAG is a novel approach that efficiently combines a cloud-based Large Language Model (LLM) with a smaller client-side language model through retrieval augmented memory to deliver real-time composition assistance. It allows the client model to generate effective responses by leveraging the LLM's capabilities and contextual information. The framework comprises four main components: 1. Augmentation coordinator (client): Monitors the writing context, requests augmented memory from the cloud, and integrates it into the client model 2. Memory-augmented client model (client): Utilizes the augmented memory to suggest the next words, enhancing its overall performance. 3. Retriever model (cloud): Searches the retrieval corpus to find relevant data based on the user input. 4. LLM-based memory generator (cloud): Constructs concise key takeaways from the retrieved data to optimize memory usefulness. The HybridRAG framework offers multiple benefits: 1. Enhanced utility: Enables the client model to provide better suggestions by utilizing cloud-based resources. 2. Low latency: Ensures low latency by asynchronously updating memory, allowing the client model to make predictions swiftly without waiting for responses from the cloud. 3. Reduced client-to-cloud communication: Minimizes communication between the client and cloud, saving costs by only requesting augmented memory when needed. 4. Efficient memory integration: Integrates cloud-generated memory into the client model to enhance performance and reduce data transfer volume. By leveraging cloud-generated memory, HybridRAG efficiently enhances the utility of small language models on client devices through retrieval augmentation, operating asynchronously. This innovative framework bridges the gap between latency challenges and model performance, making it a valuable solution for real-time tasks like composition assistance."],
    ["MetaRAG is introduced as a framework that combines the retrieval-augmented generation process with metacognitive abilities, inspired by human cognitive processes. The integration of Metacognition in MetaRAG allows the model to monitor, evaluate, and plan its response strategies, enhancing introspective reasoning and self-awareness in the cognitive process. This approach enables the model to identify shortcomings in the reasoning process and aims to enhance the accuracy of answer derivation. The overall MetaRAG framework includes two spaces: the cognition space and the metacognition space. The cognition space functions as a Question Answering (QA) system, while the metacognition space serves as both an evaluator and critic, introspecting the reasoning process. The metacognition space primarily encompasses three main phases: Monitoring, Evaluating, and Planning. The monitoring phase assesses the quality of the current response to determine the need for metacognitive evaluation. The evaluating phase identifies reasons why the current answer may not meet the requirements, utilizing metacognitive knowledge to analyze flaws in the response. In summary, MetaRAG is a framework that integrates metacognition into retrieval-augmented generation processes. By leveraging these metacognitive capabilities, the model can enhance its reasoning abilities, identify errors in its responses, and optimize its cognitive processes for more accurate answer generation."],
    ["RAGTruth is a hallucination corpus specifically designed for analyzing word-level hallucinations in diverse tasks within the Retrieval-Augmented Generation (RAG) context for Language Models. This corpus encompasses nearly 18,000 naturally generated responses from various models using RAG. Each response has undergone meticulous manual annotation at both individual cases and word levels, evaluating the intensity of hallucinations. RAGTruth serves as a benchmark dataset to measure the extent of hallucination within Large Language Models (LLMs) and assess existing strategies for hallucination detection. It provides a foundation for developing effective hallucination prevention methods under RAG scenarios, aiming to enhance the trustworthiness of LLM applications. With a focus on the effectiveness of hallucination detection methodologies, RAGTruth enables thorough comparisons across different LLMs and tasks. Additionally, the report emphasizes the significance of high-quality datasets like RAGTruth in fine-tuning relatively smaller LLMs to achieve competitive performance in hallucination detection, underscoring the potential for advancing detection methods using specialized datasets. With detailed annotations and diverse responses across tasks, RAGTruth facilitates a comprehensive understanding of hallucination phenomena in LLMs, contributing to the ongoing research in this critical area."],
    ["The PaperQA system revolutionizes scientific research with its cutting-edge capabilities in answering complex scientific questions. Developed by a team of researchers, PaperQA utilizes advanced tools and technologies to retrieve relevant information from online databases like Arxiv and Pubmed. By synthesizing data from research papers, PaperQA provides accurate answers comparable to human experts. Its innovative approach includes decomposing tasks, utilizing map summarization, and leveraging advanced language models to enhance retrieval and reasoning processes. PaperQA's cost-effectiveness, accuracy, and reduced error rates make it a valuable asset for scientific exploration. This breakthrough system promises to expedite research processes and reduce costs in the natural sciences, paving the way for faster innovation and discovery."],
    ["Various evaluation metrics play a crucial role in assessing the performance of Retrieval-Augmented Generation (RAG) systems across different text generation scenarios.  Methods that can be used are reference-free, evaluation for retrievers, evaluation for embedding models, RagQuestEval Metrics, evaluation based on chunk size, evaluation of LLMs as well as end-to-end evaluation metrics. Reference-Free Evaluation Methods: The utilization of metrics like TruLens-Eval, RAGAS, and ARES is discussed for evaluating RAG systems' performance concerning context relevance, answer faithfulness, and answer relevance. Evaluation of Retrievers: Experimental results incorporate metrics such as BLEU, ROUGE-L, bertScore, and RAGQuestEval to evaluate different retrievers' performance in tasks like summarization and question-answering. Evaluation of Embedding Models: Experimental results evaluate different embedding models based on metrics such as BLEU, ROUGE-L, bertScore, and RAGQuestEval for tasks like text continuation and summarization. RAGQuestEval Metric: The RAGQuestEval metric measures the recall and precision of key information in the generated text compared to the ground truth references. Evaluation based on Chunk Size: Evaluation of different chunk sizes utilizes metrics like BLEU, ROUGE-L, bertScore, and RAGQuestEval for text continuation and summarization tasks. Evaluation of Large Language Models: Experimental results showcase the evaluation of different large language models using metrics such as BLEU, ROUGE-L, bertScore, and RAGQuestEval for text continuation and summarization tasks. End-to-End Evaluation Method: An end-to-end evaluation method is adopted to assess RAG systems' ability to retrieve relevant documents and generate sensible responses, comparing the similarity between model output and reference answers. Moreover, the report introduces the Retrieval-Augmented Generation Benchmark (RGB) to evaluate Large Language Models' (LLMs) abilities in retrieval-augmented generation tasks. The benchmark requires RAG systems to exhibit Noise Robustness, Negative Rejection, Information Integration, and Counterfactual Robustness for accurate responses. Data construction involves using actual news articles to simulate real-world scenarios for evaluating LLMs by judging their retrieval-augmented responses to questions. Evaluation metrics used in the RGB include Accuracy for noise robustness and information integration, Rejection Rate for negative rejection, and Error Detection Rate for counterfactual robustness. The paper also discusses the impact of noise on LLMs' performance in RAG tasks, highlighting common errors such as long-distance information issues, evidence uncertainty, and concept confusion."],
    ["Retrievers are a crucial component in the field of natural language processing, especially in the context of information retrieval and question answering systems. They play a key role in enabling models to access external knowledge sources efficiently and effectively. Retrievers are often utilized to retrieve and present relevant documents or passages from large corpora, such as Wikipedia, to aid in answering queries or generating responses. The integration of retrievers in language models, such as in the Framework Retrieval-Augmented Language Model (REALM), allows for the retrieval of knowledge from external sources during pre-training, fine-tuning, and inference stages. This retrieval process involves selecting and attending over documents from a corpus to inform the predictions or responses generated by the model. Retrievers in systems like REALM are trained using unsupervised techniques, such as masked language modeling as the learning signal, and backpropagation through a retrieval step that considers millions of documents. The retriever is designed to reward informative retrievals that improve model predictions and penalize uninformative retrievals, thereby encouraging the model to retrieve relevant knowledge. One of the main challenges in incorporating retrievers in neural networks is the computational complexity involved, especially when considering a large corpus of documents. Strategies such as caching and asynchronous updates, as well as approaches like Maximum Inner Product Search (MIPS), are employed to address these challenges and optimize the retrieval process efficiently. Furthermore, retrievers are instrumental in tasks like open-domain question answering, where models need to access a vast amount of external knowledge to provide accurate responses. By enabling models to retrieve and incorporate relevant information from a knowledge corpus, retrievers enhance the performance and accuracy of the overall system, as demonstrated by the significant improvements achieved in benchmarks like NaturalQuestions-Open, WebQuestions, and CuratedTrec. In conclusion, retrievers are fundamental components in language models that facilitate the retrieval of external knowledge for various NLP tasks. Their effective integration and training are essential for enhancing model performance, enabling better understanding, and interpretation of textual data across a wide range of applications."]
]

### General answer by LLMs

In [8]:
max_average = 0.9120812180691525
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [17]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return dict_result, average_score

In [9]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [10]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [11]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [12]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [15]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 40/40 [00:11<00:00,  3.63it/s]


{'faithfulness': 0.6831, 'answer_relevancy': 0.9038, 'answer_similarity': 0.9177, 'answer_correctness': 0.7243}


In [16]:
dict_llm_results, avg_llm_results = dictionary(llm_results)

The average score is: 0.8072154284311781
This is the new best value!


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▎         | 1/40 [00:04<02:50,  4.37s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 40/40 [00:16<00:00,  2.37it/s]


{'faithfulness': 0.7551, 'answer_relevancy': 0.8502, 'answer_similarity': 0.9499, 'answer_correctness': 0.7826}


In [18]:
dict_llm_results_gpt4, avg_llm_results_gpt4 = dictionary(llm_results_gpt4)

The average score is: 0.8344622643459034
This is the new best value!


# RAG

### Naive RAG

In [9]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [10]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [21]:
db_naive = Chroma(persist_directory = "../papers/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [11]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [23]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [24]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:05<00:13,  3.37it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.57it/s]


{'faithfulness': 0.7213, 'answer_relevancy': 0.9169, 'context_precision': 0.9043, 'context_recall': 0.9500, 'answer_similarity': 0.9883, 'answer_correctness': 0.8164}


In [25]:
dict_result_naive_rag, avg_result_naive_rag = dictionary(result_naive_rag)

The average score is: 0.8828677705712465
This is the new best value!


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [26]:
db_basic = Chroma(persist_directory = "../papers/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [27]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [28]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:06<00:13,  3.27it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.96it/s]


{'faithfulness': 0.7028, 'answer_relevancy': 0.8870, 'context_precision': 0.8987, 'context_recall': 0.9252, 'answer_similarity': 0.9892, 'answer_correctness': 0.8153}


In [29]:
dict_result_recursive, avg_result_recursive= dictionary(result_recursive)

The average score is: 0.8696745382065824


### chunk sizes

In [12]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

chunk = 1000

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb/recursive_1000")
db_1000.persist()

In [30]:
db_1000 = Chroma(persist_directory = "../papers/vectordb/recursive_1000", embedding_function=embeddings_client)

In [32]:
retriever_1000 = db_1000.as_retriever()
result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  28%|██▊       | 17/60 [00:06<00:12,  3.38it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  48%|████▊     | 29/60 [00:07<00:05,  5.83it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.59it/s]


CHUNK SIZE 1000
{'faithfulness': 0.8905, 'answer_relevancy': 0.7862, 'context_precision': 0.8855, 'context_recall': 0.8967, 'answer_similarity': 0.9478, 'answer_correctness': 0.9411}


In [33]:
dict_1000, avg_1000 = dictionary(result_1000)

The average score is: 0.8912987544903516
This is the new best value!


chunk = 500

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../papers/vectordb/recursive_500")
db_500.persist()

In [14]:
db_500 = Chroma(persist_directory = "../papers/vectordb/recursive_500", embedding_function=embeddings_client)

In [35]:
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.64it/s]


CHUNK SIZE 500
{'faithfulness': 0.9167, 'answer_relevancy': 0.8880, 'context_precision': 0.9917, 'context_recall': 1.0000, 'answer_similarity': 0.9790, 'answer_correctness': 0.6891}


In [36]:
dict_result_500, avg_500 = dictionary(result_500)

The average score is: 0.9107365573781259
This is the new best value!


chunk = 2000

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../papers/vectordb/recursive_2000")


In [37]:
db_2000 = Chroma(persist_directory = "../papers/vectordb/recursive_2000", embedding_function=embeddings_client)

In [38]:
retriever_2000 = db_2000.as_retriever()
result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.11it/s]


CHUNK SIZE 2000
{'faithfulness': 0.8900, 'answer_relevancy': 0.8478, 'context_precision': 0.9806, 'context_recall': 0.9246, 'answer_similarity': 0.9734, 'answer_correctness': 0.5669}


In [39]:
dict_result_2000, avg_result_2000 = dictionary(result_2000)

The average score is: 0.8638815240649625


chunk = 3000

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../papers/vectordb/recursive_3000")

In [40]:
db_3000 = Chroma(persist_directory = "../papers/vectordb/recursive_3000", embedding_function=embeddings_client)

In [41]:
retriever_3000 = db_3000.as_retriever()
result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
print("CHUNK SIZE 3000")
print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:05<00:16,  2.74it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.59it/s]


CHUNK SIZE 3000
{'faithfulness': 0.7440, 'answer_relevancy': 0.9093, 'context_precision': 0.9021, 'context_recall': 0.9500, 'answer_similarity': 0.9893, 'answer_correctness': 0.8170}


In [42]:
dict_result_3000, avg_result_3000 = dictionary(result_3000)

The average score is: 0.8852887066826051


### now time to look for different top-k

Note: We continue with the original chunk size as it had the highest average score

In [43]:
retriever_3 = db_500.as_retriever(search_kwargs={"k": 3})

In [44]:
retriever_3 = db_basic.as_retriever(search_kwargs={"k": 3})
result_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 500, K=3")
print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:05<00:21,  2.20it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.33it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.7269, 'answer_relevancy': 0.9125, 'context_precision': 0.9214, 'context_recall': 0.9333, 'answer_similarity': 0.9359, 'answer_correctness': 0.8097}


In [45]:
dict_result_3, avg_result_3 = dictionary(result_3)

The average score is: 0.873289232722568


In [46]:
retriever_5 = db_500.as_retriever(search_kwargs={"k": 5})
result_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 500, K=5")
print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:06<00:26,  1.75it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.71it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.7492, 'answer_relevancy': 0.9259, 'context_precision': 0.9033, 'context_recall': 0.9783, 'answer_similarity': 0.9435, 'answer_correctness': 0.9005}


In [47]:
dict_result_5, avg_result_5 = dictionary(result_5)

The average score is: 0.900102494862172


In [48]:
retriever_6= db_500.as_retriever(search_kwargs={"k": 6})
result_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 500, K=5")
print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:06<00:14,  3.05it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  32%|███▏      | 19/60 [00:06<00:10,  3.81it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.22it/s]


CHUNK SIZE 500, K=5
{'faithfulness': 0.7299, 'answer_relevancy': 0.9467, 'context_precision': 0.9045, 'context_recall': 0.9670, 'answer_similarity': 0.9950, 'answer_correctness': 0.8770}


In [49]:
dict_result_6, avg_result_6 = dictionary(result_6)

The average score is: 0.9033520818395568


In [54]:
retriever_2= db_500.as_retriever(search_kwargs={"k": 2})
result_2 = run_and_evaluate(retriever_2, prompt, llm)
print("CHUNK SIZE 500, K=2")
print(result_2)
dict_result_2, avg_result_2 = dictionary(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:05<00:12,  3.47it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.63it/s]


CHUNK SIZE 500, K=2
{'faithfulness': 0.6652, 'answer_relevancy': 0.9583, 'context_precision': 0.9069, 'context_recall': 1.0000, 'answer_similarity': 0.9839, 'answer_correctness': 0.9275}
The average score is: 0.9069654088026623


### look for different retrievers

4 chunks was the best score

#### parent document retriever

In [51]:
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents", persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
parent_document_retriever.add_documents(documents)
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent, avg_result_parent = dictionary(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  65%|██████▌   | 39/60 [00:07<00:02, 10.20it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.19it/s]


{'faithfulness': 0.9667, 'answer_relevancy': 0.8868, 'context_precision': 1.0000, 'context_recall': 0.9400, 'answer_similarity': 0.9758, 'answer_correctness': 0.5918}
The average score is: 0.8934964425047197


In [52]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small
)
parent_document_retriever_small.add_documents(documents)
result_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
dict_result_parent_small, avg_result_parent_small = dictionary(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  12%|█▏        | 7/60 [00:06<00:37,  1.40it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  13%|█▎        | 8/60 [00:09<00:54,  1.05s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.22it/s]


{'faithfulness': 0.9090, 'answer_relevancy': 0.7945, 'context_precision': 0.9875, 'context_recall': 0.9250, 'answer_similarity': 0.9803, 'answer_correctness': 0.7148}
The average score is: 0.8851767276348128


In [53]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_large", persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)

store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large
)
parent_document_retriever_large.add_documents(documents)
result_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
dict_result_parent_large, avg_result_parent_large= dictionary(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  57%|█████▋    | 34/60 [00:05<00:02,  9.16it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.04it/s]


{'faithfulness': 0.9381, 'answer_relevancy': 0.8582, 'context_precision': 1.0000, 'context_recall': 0.9500, 'answer_similarity': 0.9749, 'answer_correctness': 0.6698}
The average score is: 0.8984896285502967


#### Maximum marginal relevance retrieval

In [55]:
retriever_mmr = db_500.as_retriever(search_type="mmr")
result_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
dict_result_mmr ,average_score_mmr = dictionary(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.59it/s]


Marginal relevance
{'faithfulness': 0.9857, 'answer_relevancy': 0.8853, 'context_precision': 0.9917, 'context_recall': 1.0000, 'answer_similarity': 0.9778, 'answer_correctness': 0.6180}
The average score is: 0.9097385413266386


#### BM25

In [24]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_bm25 = text_splitter.split_documents(documents)

In [57]:
retriever_bm25 = BM25Retriever.from_documents(chunks_bm25)
result_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
dict_result_bm25 ,average_score_bm25 = dictionary(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  50%|█████     | 30/60 [00:06<00:05,  5.93it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:17<00:00,  3.45it/s]


BM25
{'faithfulness': 1.0000, 'answer_relevancy': 0.8580, 'context_precision': 0.9750, 'context_recall': 0.9750, 'answer_similarity': 0.9791, 'answer_correctness': 0.6854}
The average score is: 0.9120812180691525
This is the new best value!


In [58]:
# try with chunk = 3000
text_splitter_3000 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_bm25_3000 = text_splitter_3000.split_documents(documents)
retriever_bm25_3000 = BM25Retriever.from_documents(chunks_bm25_3000)
result_bm25_3000 = run_and_evaluate(retriever_bm25_3000, prompt, llm)
print("BM25")
print(result_bm25_3000)
dict_result_bm25_3000 ,average_score_bm25_3000 = dictionary(result_bm25_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:04<00:14,  3.40it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.82it/s]


BM25
{'faithfulness': 0.9026, 'answer_relevancy': 0.9147, 'context_precision': 0.9762, 'context_recall': 0.9500, 'answer_similarity': 0.9761, 'answer_correctness': 0.6333}
The average score is: 0.892143524398603


In [59]:
text_splitter_1000 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_bm25_1000 = text_splitter_1000.split_documents(documents)
retriever_bm25_1000 = BM25Retriever.from_documents(chunks_bm25_1000)
result_bm25_1000 = run_and_evaluate(retriever_bm25_1000, prompt, llm)
print("BM25")
print(result_bm25_1000)
dict_result_bm25_1000 ,average_score_bm25_1000 = dictionary(result_bm25_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  30%|███       | 18/60 [00:05<00:09,  4.48it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.18it/s]


BM25
{'faithfulness': 0.7327, 'answer_relevancy': 0.9705, 'context_precision': 0.8880, 'context_recall': 0.9667, 'answer_similarity': 0.9852, 'answer_correctness': 0.8383}
The average score is: 0.8968975153804585


#### Ensambler - Hybrid

In [60]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.75, 0.25])
result_ensemble1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
dict_result_ensemble1, average_score_ensambler1 = dictionary(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  30%|███       | 18/60 [00:05<00:10,  4.03it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.14it/s]


Ensambler 75/25
{'faithfulness': 0.7563, 'answer_relevancy': 0.9554, 'context_precision': 0.8717, 'context_recall': 0.9700, 'answer_similarity': 0.9790, 'answer_correctness': 0.7817}
The average score is: 0.8856747169775865


In [62]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.5, 0.5])
result_ensemble2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
dict_result_ensemble2, avg_result_ensemble2 = dictionary(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:05<00:16,  2.83it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:20<00:00,  2.95it/s]


Ensambler 50/50
{'faithfulness': 0.7303, 'answer_relevancy': 0.9189, 'context_precision': 0.8804, 'context_recall': 0.9750, 'answer_similarity': 0.9844, 'answer_correctness': 0.8087}
The average score is: 0.882930628187829


In [61]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.25,0.75])
result_ensemble3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
dict_result_ensemble3, avg_result_ensemble3 = dictionary(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating:  43%|████▎     | 26/60 [00:06<00:05,  6.38it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.29it/s]


Ensambler 25/75
{'faithfulness': 0.6660, 'answer_relevancy': 0.9300, 'context_precision': 0.8720, 'context_recall': 0.9639, 'answer_similarity': 0.9936, 'answer_correctness': 0.9343}
The average score is: 0.8932884641331101


In [63]:
ensemble_retriever_4 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.25,0.75])
result_ensemble4 = run_and_evaluate(ensemble_retriever_4, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble4)
dict_result_ensemble4, avg_result_ensemble4 = dictionary(result_ensemble4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:03<00:12,  3.88it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 60/60 [00:19<00:00,  3.08it/s]


Ensambler 25/75
{'faithfulness': 1.0000, 'answer_relevancy': 0.8392, 'context_precision': 0.9623, 'context_recall': 1.0000, 'answer_similarity': 0.9702, 'answer_correctness': 0.5096}
The average score is: 0.8802195451473375


In [64]:
ensemble_retriever_5 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.5,0.5])
result_ensemble5 = run_and_evaluate(ensemble_retriever_5, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble5)
dict_result_ensemble5, avg_result_ensemble5 = dictionary(result_ensemble5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:05<00:16,  2.71it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.63it/s]


Ensambler 50/50
{'faithfulness': 0.9266, 'answer_relevancy': 0.6610, 'context_precision': 0.9469, 'context_recall': 0.8847, 'answer_similarity': 0.9869, 'answer_correctness': 0.9554}
The average score is: 0.8935946429495042


In [65]:
ensemble_retriever_6 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.75,0.25])
result_ensemble6 = run_and_evaluate(ensemble_retriever_6, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble6)
dict_result_ensemble6, avg_result_ensemble6 = dictionary(result_ensemble6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.23it/s]


Ensambler 75/25
{'faithfulness': 0.9821, 'answer_relevancy': 0.8600, 'context_precision': 0.9525, 'context_recall': 0.9800, 'answer_similarity': 0.9756, 'answer_correctness': 0.6036}
The average score is: 0.8923023186849471


#### Multi-stage - reranker

In [13]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [15]:
retriever_500 = db_500.as_retriever()

In [21]:
retriever_context = retriever_500
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
dict_result_compression,avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.01it/s]


Reranker
{'faithfulness': 0.9792, 'answer_relevancy': 0.8664, 'context_precision': 1.0000, 'context_recall': 1.0000, 'answer_similarity': 0.9794, 'answer_correctness': 0.6023}
The average score is: 0.9045504305538364


In [18]:
dict_result_compression,avg_result_compression = dictionary(result_compression)

The average score is: 0.9045384007642943


In [25]:
retriever_bm25 = BM25Retriever.from_documents(chunks_bm25)
compression_retriever_bm25 = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_bm25
)

result_compression_bm25 = run_and_evaluate(compression_retriever_bm25, prompt, llm)
print("Reranker")
print(result_compression_bm25)
dict_result_compression_bm25n,avg_result_compression_bm25 = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.53it/s]


Reranker
{'faithfulness': 0.9238, 'answer_relevancy': 0.8700, 'context_precision': 0.9833, 'context_recall': 1.0000, 'answer_similarity': 0.9743, 'answer_correctness': 0.6775}
The average score is: 0.9045504305538364


#### creating context by remaking the query

In [19]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [26]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_bm25.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
print(result_search_query)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:08,  5.60it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  23%|██▎       | 14/60 [00:04<00:13,  3.35it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  33%|███▎      | 20/60 [00:04<00:07,  5.37it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.23it/s]


{'faithfulness': 0.8565, 'answer_relevancy': 0.7526, 'context_precision': 0.8996, 'context_recall': 0.9014, 'answer_similarity': 0.6293, 'answer_correctness': 0.8716}


In [27]:
dict_result_search_query, avg_result_search_query = dictionary(result_search_query)

The average score is: 0.8185190879078648


### change model to GPT-4

In [28]:
retriever_500_gpt4 = db_500.as_retriever()
result_500_gpt4 = run_and_evaluate(retriever_500_gpt4, prompt, llm_gpt4)
print("CHUNK SIZE 500")
print(result_500_gpt4)
dict_result_500_gpt4, avg_500_gpt4 = dictionary(result_500_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:05<00:21,  2.28it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.73it/s]


CHUNK SIZE 500
{'faithfulness': 0.7356, 'answer_relevancy': 0.9547, 'context_precision': 0.9352, 'context_recall': 0.9917, 'answer_similarity': 0.9802, 'answer_correctness': 0.8172}
The average score is: 0.902424135822458


In [29]:
retriever_bm25_gpt4 = BM25Retriever.from_documents(chunks_bm25)
result_bm25_gpt4 = run_and_evaluate(retriever_bm25_gpt4, prompt, llm_gpt4)
print("BM25")
print(result_bm25_gpt4)
dict_result_bm25_gpt4 ,average_score_bm25_gpt4 = dictionary(result_bm25_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  25%|██▌       | 15/60 [00:05<00:12,  3.64it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  45%|████▌     | 27/60 [00:06<00:04,  6.90it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.81it/s]


BM25
{'faithfulness': 0.7679, 'answer_relevancy': 0.9448, 'context_precision': 0.9164, 'context_recall': 1.0000, 'answer_similarity': 0.9751, 'answer_correctness': 0.7939}
The average score is: 0.8996803898250875


In [34]:
ensemble_retriever_5_gpt4 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.5,0.5])
result_ensemble5_gpt4 = run_and_evaluate(ensemble_retriever_5_gpt4, prompt, llm_gpt4)
print("Ensambler 50/50")
print(result_ensemble5_gpt4)
dict_result_ensemble5_gpt4, avg_result_ensemble5_gpt4 = dictionary(result_ensemble5_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:05<00:18,  2.51it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.47it/s]


Ensambler 50/50
{'faithfulness': 1.0000, 'answer_relevancy': 0.8346, 'context_precision': 0.9504, 'context_recall': 1.0000, 'answer_similarity': 0.9699, 'answer_correctness': 0.7210}
The average score is: 0.9126237483708679
This is the new best value!


In [35]:
retriever_context = retriever_500
compressor = CohereRerank()
compression_retriever_gpt4 = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression_gpt4 = run_and_evaluate(compression_retriever_gpt4, prompt, llm_gpt4)
print("Reranker")
print(result_compression_gpt4)
dict_result_compression_gpt4,avg_result_compression_gpt4 = dictionary(result_compression_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.68it/s]


Reranker
{'faithfulness': 0.9593, 'answer_relevancy': 0.8767, 'context_precision': 1.0000, 'context_recall': 1.0000, 'answer_similarity': 0.9681, 'answer_correctness': 0.5522}
The average score is: 0.8927103266823168
