In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import sys
sys.tracebacklimit = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
questions = [
    "What is the paper by Chen et al. about?",
    "What is the main focus and contribution of the paper by Cheonsu Jeong regarding generative AI technology, specifically in addressing information scarcity, and what is the significance of the proposed Retrieval-Augmented Generation (RAG) model in improving content generation within the Large Language Models (LLM) application architecture?",
    "What are the conclusions of CRUD-RAG: A Comprehensive Chinese Benchmark for Retrieval-Augmented Generation of Large Language Models?",
    "How does the proposed approach in the 'Efficient Retrieval Augmented Generation from Unstructured Knowledge for Task-Oriented Dialog' address the computational complexity issue in knowledge selection and what methods are employed in response generation to consider multiple knowledge snippets?",
    "What challenges and potential applications are discussed for large language models like GPT in the context of automated form completion and knowledge-intensive tasks, and what innovative methods are proposed to address these challenges in each scenario?",
    "Explain Metacognitive RAG",
    "What is the HybridRAG framework and its main components, and how does it aim to enhance the performance of a client-based language model?",
    "What are the reasons for using GPT-4 over GPT-3.5?",
    "What are the key challenges in the implementation of RAG systems, and how do the authors of 'THE CHRONICLES OF RAG: THE RETRIEVER, THE CHUNK AND THE GENERATOR' propose to address them, particularly focusing on retriever quality, input size optimization, and overall performance evaluation?",
    "Why is RAG preferred over general LLMs?"
]

ground_truths = [
    ["The research paper 'Benchmarking Large Language Models in Retrieval-Augmented Generation' evaluates the abilities of Large Language Models (LLMs) in utilizing retrieved information for generation tasks. It introduces the Retrieval-Augmented Generation Benchmark (RGB) to assess LLMs' capabilities in noisy environments, negative rejection scenarios, information integration, and counterfactual robustness. The paper aims to comprehensively evaluate LLMs' performance in effectively utilizing external information and overcoming challenges in information retrieval."],
    ["The research paper focuses on implementing generative AI services using the Large Language Models (LLM) application architecture. The main contributions of the paper include addressing information scarcity within LLM by proposing specific remedies using fine-tuning techniques and direct document integration. The significant contribution is the development of the Retrieval-Augmented Generation (RAG) model, which enhances information storage and retrieval processes, leading to improved content generation within enterprises utilizing LLMs. The RAG model combines information retrieval with text generation, improving the accuracy and reliability of generated content, specifically addressing challenges related to data insufficiency within LLMs."],
    ["The paper introduces a comprehensive framework, CRUD-RAG, for evaluating retrieval-augmented generation (RAG) systems in text generation tasks. The framework categorizes tasks into CRUD types (Create, Read, Update, Delete), providing a structured assessment of RAG capabilities. Large-scale datasets for each CRUD category are created to evaluate RAG performance under various conditions. Experimental comparisons demonstrate that RAG systems significantly improve content quality by incorporating information from external sources. The study emphasizes the importance of fine-tuning RAG systems, addressing factors such as retrieval model optimization, context length, knowledge base construction, and large language model deployment. The insights aim to guide researchers and practitioners, offering a roadmap for the development and refinement of RAG systems. The paper envisions stimulating further exploration and innovation in RAG technologies to advance text generation applications and enhance the integration of retrieval mechanisms and language models for more intelligent and context-aware generative systems."],
    ["The proposed approach addresses the computational complexity issue in knowledge selection by introducing the Hierarchical Selection method. This method facilitates the reduction of computational cost by splitting the task of classifying a relevant document into subsequent stages. By first identifying the relevant domain and entity before selecting the relevant snippet among the documents of that entity, the approach is able to significantly reduce the computational cost. In response generation, the approach employs Retrieval Augmented Generation (RAG) to consider multiple knowledge snippets. By using RAG, the model is able to generate responses based on multiple selected snippets, allowing for a more comprehensive and informed generation process. This method incorporates the knowledge from multiple sources to enhance the response generation process. Additionally, the knowledge selection process is optimized through the use of metadata, and two variants are presented: one involving three separate models for domain, entity, and document retrieval, and the other involving joint retrieval of domain and entity. This reduces the computational complexity from O(|K|) to O(|D| + |E|) and O(|E| + |K|) for the first and second variants, respectively. The response generation task is formulated as a marginalization over selected knowledge snippets, enabling the consideration of multiple snippets in generating responses and addressing potential errors introduced by a hard decision in the selection step. The method involves a probability distribution over knowledge snippets given a dialog context and includes both selection and generation probabilities."],
    ["The challenges and potential applications discussed for large language models like GPT include their remarkable natural language processing capabilities in tasks such as automated form completion and knowledge-intensive tasks like open-domain question answering (QA), fact checking, and dialogue systems. In the context of automated form completion, challenges include understanding form layout, guidelines, user intent, and reasoning over data. The proposed method involves creating a knowledge base, augmenting the language model with the knowledge base using retrieval-augmented generation, and using prompt engineering techniques. In knowledge-intensive tasks, the text introduces a generate-then-read (GENREAD) method, replacing document retrievers with large language model generators. Additionally, the Hybrid Retrieval-Augmented Generation (HybridRAG) framework is proposed to efficiently combine a cloud-based LLM with a smaller client-side language model. Another aspect is the introduction of MetaRAG, which combines retrieval-augmented generation with metacognition to enhance introspective reasoning abilities and improve response strategies through a three-step metacognitive regulation pipeline."],
    ["Metacognitive Retrieval-Augmented Generation (MetaRAG) is a framework for enhancing retrieval-augmented generation processes in large language models through the integration of metacognition. MetaRAG combines the capabilities of retrieval systems with metacognitive abilities inspired by human cognitive processes. It allows the model to monitor, evaluate, and plan its response strategies, enabling introspective reasoning and self-awareness in the cognitive process. This approach aims to identify and rectify errors or shortcomings in the model's reasoning, ultimately leading to more accurate and precise answer derivation."],
    ["The HybridRAG framework comprises an augmentation coordinator (client), a memory-augmented client model (client), a retriever model (cloud), and a language model (LLM)-based memory generator (cloud). The augmentation coordinator monitors writing context and requests augmented memory from the cloud based on context changes. The retriever model searches for relevant data in a retrieval corpus using Dense Passage Retrieval. The LLM-based memory generator generates concise key takeaways from retrieved documents, optimizing memory size. The memory is integrated into the client model to enhance its overall performance. The client model is fine-tuned using instruction-based prompts and LLM-generated references to effectively leverage cloud-generated memory."],
    ["Enhanced Performance: GPT-4 surpasses GPT-3.5 in various tasks, particularly excelling in question-answering scenarios that demand reasoning across multiple documents. Advanced Knowledge Integration: GPT-4 exhibits remarkable proficiency in fusing knowledge, especially when a comprehensive understanding of information from diverse documents is imperative. This underscores its superior capability in handling intricate tasks. Efficient Text Refinement: GPT-4 effectively refines and corrects text in scenarios requiring modification, maintaining accuracy and content relevance while addressing errors or hallucinations in the text. Concise Summary Production: GPT-4 generates concise summaries by eliminating redundant information, showcasing its efficiency in creating informative and succinct content. Robust Performance: While GPT-4 stands out as the preferred choice due to its overall outstanding performance, models like Qwen-7B and Qwen-14B also demonstrate competitive results in tasks such as text continuation, summary generation, and question answering."],
    ["The challenges in RAG system implementation include effective integration of retrieval models, representation learning efficiency, diverse data handling, computational efficiency optimization, and quality evaluation. The article proposes best practices for implementing RAG on a Brazilian Portuguese dataset, emphasizing a simplified pipeline. Key findings include a 35.4% improvement in MRR@10 with retriever quality enhancement, a 2.4% performance boost through input size optimization, and a final accuracy of 98.61%, marking a 40.73% improvement compared to the baseline. The study underscores the importance of data quality in input, retriever, and evaluation, providing insights into formulating queries, understanding information retrieval behavior, and defining evaluation metrics. The future work involves exploring additional datasets for segmentation and chunk construction techniques."],
    ["RAG (Retrieval Augmented Generation) is preferred over regular Large Language Models (LLMs) due to its ability to access external data, address training limitations, and mitigate hallucinations by integrating retrieved information. It leverages an effective retriever pipeline for coherent responses, enhancing text generation quality with relevant external information. RAG employs external knowledge sources, frameworks like LlamaIndex and LangChain, and metacognitive abilities to improve response credibility, accuracy, and reasoning over multiple evidence pieces. It offers benefits such as enhanced utility, low latency through HybridRAG, reduced client-to-cloud communication, and efficient memory integration. In form filling, RAG accesses additional information for accurate responses, correcting errors, and conditioning response generation on selected knowledge snippets. It mitigates issues in LLMs, such as factual errors and outdated knowledge, providing a promising solution by integrating external knowledge for more reliable and coherent responses."]
]

### General answer by LLMs

In [21]:
# max_average = 0
max_average = 0.8972509357752552
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [8]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return dict_result, average_score

In [10]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [11]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [12]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [13]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [14]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:14<00:00,  2.71it/s]


{'faithfulness': 1.0000, 'answer_relevancy': 0.7397, 'answer_similarity': 0.9158, 'answer_correctness': 0.4776}


In [37]:
dict_llm_results, avg_llm_results = dictionary(llm_results)

The average score is: 0.7832904655601836
This is the new best value!


In [18]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


{'faithfulness': 0.9714, 'answer_relevancy': 0.7348, 'answer_similarity': 0.9034, 'answer_correctness': 0.5471}


In [38]:
dict_llm_results_gpt4, avg_llm_results_gpt4 = dictionary(llm_results_gpt4)

The average score is: 0.7891976282160822
This is the new best value!


# RAG

### Naive RAG

In [9]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [10]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [22]:
db_naive = Chroma(persist_directory = "../papers/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [11]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [24]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [25]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:04<00:20,  2.37it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.60it/s]


{'faithfulness': 0.7146, 'answer_relevancy': 0.9150, 'context_precision': 0.8762, 'context_recall': 0.8806, 'answer_similarity': 0.9322, 'answer_correctness': 0.9078}


In [39]:
dict_result_naive_rag, avg_result_naive_rag = dictionary(result_naive_rag)

The average score is: 0.8710708648925052
This is the new best value!


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [12]:
db_basic = Chroma(persist_directory = "../papers/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [28]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [29]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:05<00:20,  2.35it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  80%|████████  | 48/60 [00:06<00:00, 17.46it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.40it/s]


{'faithfulness': 0.7240, 'answer_relevancy': 0.9056, 'context_precision': 0.8975, 'context_recall': 0.9417, 'answer_similarity': 0.9322, 'answer_correctness': 0.9079}


In [40]:
dict_result_recursive, avg_result_recursive= dictionary(result_recursive)

The average score is: 0.8848003610899471
This is the new best value!


### chunk sizes

In [13]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

chunk = 1000

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb/recursive_1000")
db_1000.persist()

In [30]:
db_1000 = Chroma(persist_directory = "../papers/vectordb/recursive_1000", embedding_function=embeddings_client)

In [43]:
retriever_1000 = db_1000.as_retriever()
result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.43it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9444, 'answer_relevancy': 0.7431, 'context_precision': 0.7889, 'context_recall': 0.9000, 'answer_similarity': 0.9445, 'answer_correctness': 0.6325}


In [44]:
dict_1000, avg_1000 = dictionary(result_1000)

The average score is: 0.8255860278791941


chunk = 500

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../papers/vectordb/recursive_500")
db_500.persist()

In [45]:
db_500 = Chroma(persist_directory = "../papers/vectordb/recursive_500", embedding_function=embeddings_client)

In [46]:
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:07<00:33,  1.41it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.89it/s]


CHUNK SIZE 500
{'faithfulness': 0.9444, 'answer_relevancy': 0.7306, 'context_precision': 0.8389, 'context_recall': 0.8800, 'answer_similarity': 0.9117, 'answer_correctness': 0.6952}


In [47]:
dict_result_500, avg_500 = dictionary(result_500)

The average score is: 0.8334785012329347


chunk = 2000

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../papers/vectordb/recursive_2000")


In [48]:
db_2000 = Chroma(persist_directory = "../papers/vectordb/recursive_2000", embedding_function=embeddings_client)

In [49]:
retriever_2000 = db_2000.as_retriever()
result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  32%|███▏      | 19/60 [00:05<00:08,  5.09it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.13it/s]


CHUNK SIZE 2000
{'faithfulness': 0.7052, 'answer_relevancy': 0.9250, 'context_precision': 0.8975, 'context_recall': 0.8972, 'answer_similarity': 0.8522, 'answer_correctness': 0.9046}


In [50]:
dict_result_2000, avg_result_2000 = dictionary(result_2000)

The average score is: 0.8636180248927623


chunk = 3000

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../papers/vectordb/recursive_3000")

In [51]:
db_3000 = Chroma(persist_directory = "../papers/vectordb/recursive_3000", embedding_function=embeddings_client)

In [52]:
retriever_3000 = db_3000.as_retriever()
result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
print("CHUNK SIZE 3000")
print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  25%|██▌       | 15/60 [00:05<00:14,  3.13it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.03it/s]


CHUNK SIZE 3000
{'faithfulness': 0.6899, 'answer_relevancy': 0.9722, 'context_precision': 0.8957, 'context_recall': 0.9056, 'answer_similarity': 0.8322, 'answer_correctness': 0.9028}


In [53]:
dict_result_3000, avg_result_3000 = dictionary(result_3000)

The average score is: 0.8663851352441997


### now time to look for different top-k

Note: We continue with the original chunk size as it had the highest average score

In [19]:
retriever_3 = db_basic.as_retriever(search_kwargs={"k": 3})

In [54]:
retriever_3 = db_basic.as_retriever(search_kwargs={"k": 3})
result_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 1000, K=3")
print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.86it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.9464, 'answer_relevancy': 0.8815, 'context_precision': 0.9500, 'context_recall': 0.9200, 'answer_similarity': 0.9495, 'answer_correctness': 0.7360}


In [55]:
dict_result_3, avg_result_3 = dictionary(result_3)

The average score is: 0.8972509357752552
This is the new best value!


In [56]:
retriever_5 = db_basic.as_retriever(search_kwargs={"k": 5})
result_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.25it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.9667, 'answer_relevancy': 0.8383, 'context_precision': 0.9037, 'context_recall': 1.0000, 'answer_similarity': 0.9503, 'answer_correctness': 0.6673}


In [57]:
dict_result_5, avg_result_5 = dictionary(result_5)

The average score is: 0.8877150388594726


In [58]:
retriever_6= db_basic.as_retriever(search_kwargs={"k": 6})
result_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_6)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8813 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:14,  4.13it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 10463 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_e

CHUNK SIZE 1000, K=5
{'faithfulness': 0.8583, 'answer_relevancy': 0.8712, 'context_precision': 0.8252, 'context_recall': 1.0000, 'answer_similarity': 0.9007, 'answer_correctness': 0.7116}


In [59]:
dict_result_6, avg_result_6 = dictionary(result_6)

The average score is: 0.8611800327652616


In [60]:
retriever_7= db_basic.as_retriever(search_kwargs={"k": 7})
result_7 = run_and_evaluate(retriever_7, prompt, llm)
print("CHUNK SIZE 1000, K=7")
print(result_7)
dict_result_7, avg_result_7 = dictionary(result_7)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9629 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:09,  6.55it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8434 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

CHUNK SIZE 1000, K=7
{'faithfulness': 0.9206, 'answer_relevancy': 0.7501, 'context_precision': 0.9289, 'context_recall': 0.9192, 'answer_similarity': 0.8812, 'answer_correctness': 0.6395}
The average score is: 0.8399161234829693


### look for different retrievers

3 chunks was the best score

#### parent document retriever

In [61]:
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents", persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_kwargs={"k": 3}
)
parent_document_retriever.add_documents(documents)
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent, avg_result_parent = dictionary(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:02<02:16,  2.32s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  30%|███       | 18/60 [00:18<00:43,  1.03s/it]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.20it/s]


{'faithfulness': 0.8855, 'answer_relevancy': 0.7696, 'context_precision': 0.8861, 'context_recall': 0.8000, 'answer_similarity': 0.9503, 'answer_correctness': 0.7148}
The average score is: 0.834366518871876


In [62]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
    search_kwargs={"k": 3}
)
parent_document_retriever_small.add_documents(documents)
result_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
dict_result_parent_small, avg_result_parent_small = dictionary(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.92it/s]


{'faithfulness': 0.9000, 'answer_relevancy': 0.6453, 'context_precision': 0.9000, 'context_recall': 0.7800, 'answer_similarity': 0.9186, 'answer_correctness': 0.6382}
The average score is: 0.7970263283111394


In [63]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_large", persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)

store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
    search_kwargs={"k": 3}
)
parent_document_retriever_large.add_documents(documents)
result_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
dict_result_parent_large, avg_result_parent_large= dictionary(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:02<00:05,  7.86it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  55%|█████▌    | 33/60 [00:04<00:02, 12.44it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.68it/s]


{'faithfulness': 0.8792, 'answer_relevancy': 0.8292, 'context_precision': 0.9861, 'context_recall': 0.8000, 'answer_similarity': 0.8545, 'answer_correctness': 0.7295}
The average score is: 0.8464055108750075


#### Maximum marginal relevance retrieval

In [64]:
retriever_mmr = db_basic.as_retriever(search_type="mmr",search_kwargs={"k": 3})
result_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
dict_result_mmr ,average_score_mmr = dictionary(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.61it/s]


Marginal relevance
{'faithfulness': 0.9417, 'answer_relevancy': 0.8345, 'context_precision': 0.8667, 'context_recall': 0.9000, 'answer_similarity': 0.9478, 'answer_correctness': 0.6859}
The average score is: 0.8627623175021711


#### BM25

In [65]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_bm25 = text_splitter.split_documents(documents)

In [66]:
retriever_bm25 = BM25Retriever.from_documents(chunks_bm25, search_kwargs={"k": 3})
result_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
dict_result_bm25 ,average_score_bm25 = dictionary(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.53it/s]


BM25
{'faithfulness': 1.0000, 'answer_relevancy': 0.8347, 'context_precision': 0.6833, 'context_recall': 1.0000, 'answer_similarity': 0.9546, 'answer_correctness': 0.7388}
The average score is: 0.8685838881591635


In [71]:
# try with chunk = 3000
text_splitter_3000 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_bm25_3000 = text_splitter_3000.split_documents(documents)
retriever_bm25_3000 = BM25Retriever.from_documents(chunks_bm25_3000, search_kwargs={"k": 3})
result_bm25_3000 = run_and_evaluate(retriever_bm25_3000, prompt, llm)
print("BM25")
print(result_bm25_3000)
dict_result_bm25_3000 ,average_score_bm25_3000 = dictionary(result_bm25_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:08<00:28,  1.63it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.99it/s]


BM25
{'faithfulness': 0.8127, 'answer_relevancy': 0.9282, 'context_precision': 0.6755, 'context_recall': 0.9500, 'answer_similarity': 0.9788, 'answer_correctness': 0.7929}
The average score is: 0.8685838881591635


In [72]:
dict_result_bm25_3000 ,average_score_bm25_3000 = dictionary(result_bm25_3000)

The average score is: 0.8563471513564703


In [29]:
text_splitter_1000 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_bm25_1000 = text_splitter_1000.split_documents(documents)
retriever_bm25_1000 = BM25Retriever.from_documents(chunks_bm25_1000)

In [74]:
# try with chunk = 1000
text_splitter_1000 = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_bm25_1000 = text_splitter_1000.split_documents(documents)
retriever_bm25_1000 = BM25Retriever.from_documents(chunks_bm25_1000)
result_bm25_1000 = run_and_evaluate(retriever_bm25_1000, prompt, llm)
print("BM25")
print(result_bm25_1000)
dict_result_bm25_1000 ,average_score_bm25_1000 = dictionary(result_bm25_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.01it/s]


BM25
{'faithfulness': 0.9630, 'answer_relevancy': 0.8946, 'context_precision': 0.7000, 'context_recall': 1.0000, 'answer_similarity': 0.9489, 'answer_correctness': 0.6975}
The average score is: 0.8673045523050709


#### Ensambler - Hybrid

In [67]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.75, 0.25])
result_ensemble1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
dict_result_ensemble1, average_score_ensambler1 = dictionary(result_ensemble1)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9837 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:12,  4.58it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8684 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

Ensambler 75/25
{'faithfulness': 0.9334, 'answer_relevancy': 0.8602, 'context_precision': 0.6201, 'context_recall': 0.9326, 'answer_similarity': 0.8021, 'answer_correctness': 0.7320}
The average score is: 0.8134205752119632


In [68]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
result_ensemble2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
dict_result_ensemble2, avg_result_ensemble2 = dictionary(result_ensemble2)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9836 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:16,  3.65it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8684 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

Ensambler 50/50
{'faithfulness': 0.9074, 'answer_relevancy': 0.7933, 'context_precision': 0.9169, 'context_recall': 0.8708, 'answer_similarity': 0.8751, 'answer_correctness': 0.6210}
The average score is: 0.8307459953221535


In [70]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.25,0.75], search_kwargs = {"k": 3})
result_ensemble3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
dict_result_ensemble3, avg_result_ensemble3 = dictionary(result_ensemble3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9836 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:57,  1.02it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8684 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

Ensambler 25/75
{'faithfulness': 0.8903, 'answer_relevancy': 0.8441, 'context_precision': 0.7885, 'context_recall': 0.9612, 'answer_similarity': 0.7973, 'answer_correctness': 0.7086}
The average score is: 0.8316611535404289


In [75]:
ensemble_retriever_4 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.25,0.75])
result_ensemble4 = run_and_evaluate(ensemble_retriever_4, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble4)
dict_result_ensemble4, avg_result_ensemble4 = dictionary(result_ensemble4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:06<06:52,  6.98s/it]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.66it/s]


Ensambler 25/75
{'faithfulness': 0.9667, 'answer_relevancy': 0.8221, 'context_precision': 0.7231, 'context_recall': 1.0000, 'answer_similarity': 0.9493, 'answer_correctness': 0.6830}
The average score is: 0.8573750593447539


In [76]:
ensemble_retriever_5 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.5,0.5])
result_ensemble5 = run_and_evaluate(ensemble_retriever_5, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble5)
dict_result_ensemble5, avg_result_ensemble5 = dictionary(result_ensemble5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:20<00:00,  2.94it/s]


Ensambler 50/50
{'faithfulness': 0.9563, 'answer_relevancy': 0.8236, 'context_precision': 0.7297, 'context_recall': 0.9714, 'answer_similarity': 0.9455, 'answer_correctness': 0.6858}
The average score is: 0.8520531026819717


In [77]:
ensemble_retriever_6 = EnsembleRetriever(retrievers=[retriever_bm25_1000, retriever_1000], weights=[0.75,0.25])
result_ensemble6 = run_and_evaluate(ensemble_retriever_6, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble6)
dict_result_ensemble6, avg_result_ensemble6 = dictionary(result_ensemble6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:06<00:23,  1.98it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.34it/s]


Ensambler 75/25
{'faithfulness': 0.9722, 'answer_relevancy': 0.8195, 'context_precision': 0.6898, 'context_recall': 1.0000, 'answer_similarity': 0.9344, 'answer_correctness': 0.6375}
The average score is: 0.8422365764659673


#### Multi-stage - reranker

In [27]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [28]:
retriever_context = retriever_basic
compressor = CohereRerank(top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
dict_result_compression,avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.02it/s]


Reranker
{'faithfulness': 0.9500, 'answer_relevancy': 0.9146, 'context_precision': 0.7833, 'context_recall': 0.9800, 'answer_similarity': 0.9479, 'answer_correctness': 0.7198}
The average score is: 0.8825883617554365


In [None]:
dict_result_compression,avg_result_compression = dictionary(result_compression)

#### creating context by remaking the query

In [14]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [23]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_5.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
print(result_search_query)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.32it/s]


{'faithfulness': 0.9750, 'answer_relevancy': 0.8835, 'context_precision': 0.8875, 'context_recall': 0.8750, 'answer_similarity': 0.9469, 'answer_correctness': 0.7101}


In [24]:
dict_result_search_query, avg_result_search_query = dictionary(result_search_query)

The average score is: 0.8796665823560407


### change model to GPT-4

In [20]:
result_3_gpt4 = run_and_evaluate(retriever_3, prompt, llm_gpt4)
print("top k = 3")
print(result_3_gpt4)
dict_result_3_gpt4, avg_result_3_gpt4 = dictionary(result_3_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:03<00:10,  4.80it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.44it/s]


Marginal relevance
{'faithfulness': 0.9198, 'answer_relevancy': 0.8796, 'context_precision': 0.8083, 'context_recall': 1.0000, 'answer_similarity': 0.9425, 'answer_correctness': 0.7330}
The average score is: 0.8805283590889733
This is the new best value!


In [22]:
result_basic_gpt4 = run_and_evaluate(retriever_basic, prompt, llm_gpt4)
print(result_basic_gpt4)
dict_result_basic_gpt4, avg_result_basic_gpt4 = dictionary(result_basic_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  25%|██▌       | 15/60 [00:05<00:14,  3.05it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.76it/s]


{'faithfulness': 0.8602, 'answer_relevancy': 0.8736, 'context_precision': 0.9541, 'context_recall': 0.7959, 'answer_similarity': 0.8613, 'answer_correctness': 0.9318}
The average score is: 0.8795001256924517


In [25]:
result_5_gpt4 = run_and_evaluate(retriever_5, prompt, llm_gpt4)
print("top k = 5")
print(result_5_gpt4)
dict_result_5_gpt4, avg_result_5_gpt4 = dictionary(result_5_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.26it/s]


top k = 5
{'faithfulness': 0.9889, 'answer_relevancy': 0.8411, 'context_precision': 0.7717, 'context_recall': 1.0000, 'answer_similarity': 0.9410, 'answer_correctness': 0.7420}
The average score is: 0.8807767174867084


In [32]:
result_ensambler5_gpt4 = run_and_evaluate(ensemble_retriever_5, prompt, llm_gpt4)
print("Ensambler 5")
print(result_ensambler5_gpt4)
dict_result_ensambler5_gpt4, avg_result_ensambler5_gpt4 = dictionary(result_ensambler5_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.08it/s]


Ensambler 5
{'faithfulness': 1.0000, 'answer_relevancy': 0.8443, 'context_precision': 0.7201, 'context_recall': 0.9000, 'answer_similarity': 0.9560, 'answer_correctness': 0.7225}
The average score is: 0.8571456495259143


In [33]:
result_compression_gpt4 = run_and_evaluate(compression_retriever, prompt, llm_gpt4)
print("Reranker")
print(result_compression_gpt4)
dict_result_compression_gpt4,avg_result_compression_gpt4 = dictionary(result_compression_gpt4)