In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    context_recall,
    context_precision,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import sys
sys.tracebacklimit = 0
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [23]:
questions = [
    "In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for upload and 'D' stands for delete.",
    "The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.",
    "Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as chatGPT) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.",
    "After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 8B parameter models (i.e., FiD-xl) that are fine-tuned on the training split of target datasets.",
    "Pessimistic Rejection serves as an evaluation metric to assess the ability of Language Models (LLMs) to refuse answering a question in instances where none of the provided contexts offer pertinent information. In practical scenarios, search engines frequently encounter difficulties in retrieving documents that contain the required answers. In such situations, it becomes crucial for the model to possess the capability to decline recognition, thereby preventing the generation of inaccurate or misleading content.",
    "This metacognition space primarily encompasses three main phases: (1) Deduction; (2) Evaluating; (3) Planning.",
    "Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-4 model (Radford et al. 2019) for this task.",
    "The HybridRAG framework is composed of four primary components: an augmentation coordinator (client), a client-side memory-augmented model (client), a retriever model (on-prem), and a memory generator based on Large Language Models (LLM) (cloud).",
    "A general query is characterized by the necessity to retrieve and reason over multiple pieces of supporting evidence in order to furnish an answer. To put it differently, for a given multi-hop query denoted as q, the components within the retrieval set Rq collaboratively supply an answer to q.",
    "Examples of retrieved documents Table 3 shows an example of the REALM masked language model prediction. In this example, “Fermat” is the correct word, and REALITY (row (c)) gives the word a much high probability compared to the BERT model (row (a)). Since REALM manages to retrieve some documents with a related fact (row (b)), the marginalized probability of the correct answer dramatically increases. This shows that REALM is able to retrieve document to fill in the masked word even though it is trained with unsupervised text only."
]

ground_truths = [
    ["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete."],
    ["The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth."],
    ["Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models."],
    ["After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 3B parameter models (i.e., FiD-xl) that are fine-tuned on the training split of target datasets."],
    ["Negative Rejection serves as an evaluation metric to assess the ability of Language Models (LLMs) to refuse answering a question in instances where none of the provided contexts offer pertinent information. In practical scenarios, search engines frequently encounter difficulties in retrieving documents that contain the required answers. In such situations, it becomes crucial for the model to possess the capability to decline recognition, thereby preventing the generation of inaccurate or misleading content."],
    ["This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning."],
    ["Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task."],
    ["The HybridRAG framework is composed of four primary components: an augmentation coordinator (client), a client-side memory-augmented model (client), a retriever model (cloud), and a memory generator based on Large Language Models (LLM) (cloud)."],
    ["A multi-hop query is characterized by the necessity to retrieve and reason over multiple pieces of supporting evidence in order to furnish an answer. To put it differently, for a given multi-hop query denoted as q, the components within the retrieval set Rq collaboratively supply an answer to q."],
    ["Examples of retrieved documents Table 3 shows an example of the REALM masked language model prediction. In this example, “Fermat” is the correct word, and REALM (row (c)) gives the word a much high probability compared to the BERT model (row (a)). Since REALM manages to retrieve some documents with a related fact (row (b)), the marginalized probability of the correct answer dramatically increases. This shows that REALM is able to retrieve document to fill in the masked word even though it is trained with unsupervised text only."]
]


In [4]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            context_precision,
            context_recall,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
def flatten_list(lst):
    return [item.replace('"', '').replace("'", '').replace('“','').replace('”', '').replace('\n','') for sublist in lst for item in (sublist if isinstance(sublist, list) else [sublist])]

def calculate_accuracy(answers, ground_truths):
    # Convert both answers and ground truths to lowercase for case-insensitive comparison
    answers_lower = flatten_list([answer.lower() for answer in answers])
    ground_truths_lower = flatten_list([[truth.lower() for truth in sublist] for sublist in ground_truths])
    # Check if each answer corresponds with the ground truth exactly
    accuracy = [1 if ans == truth else 0 for ans, truth in zip(answers_lower, ground_truths_lower)]
    # Calculate the average accuracy
    average_accuracy = sum(accuracy) / len(accuracy)
    return average_accuracy

### General answers by LLMs

In [7]:
template = """ Fix an error in the following query. The answer should be exactly the same as query but with corrected mistake. For example:
'input': 'There are 10 continents in the world.',
'output': 'There are 7 continents in the world.',
'input': 'The elephant is the largest animal of all time.',
'output': 'The blue whale is the largest animal of all time.'.
This is the input query: {question}"""
prompt = ChatPromptTemplate.from_template(template)

In [8]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [9]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [10]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [11]:
answers_llm

['There is no error in the query.',
 'There is no error in the given query.',
 'There are no errors in the query.',
 'There is no error in the query.',
 'There is no error in the given query.',
 'There is no error in the query.',
 'There is no error in the query.',
 'There is no error in the query.',
 'There is no error in the query.',
 'There is no error in the given query.']

In [12]:
llm_results = calculate_accuracy(answers_llm, ground_truths)
print(llm_results)

0.0


In [13]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = calculate_accuracy(answers_llm_gpt4, ground_truths)
print(llm_results_gpt4)

0.4


In [14]:
answers_llm_gpt4

["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.",
 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.',
 'This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-3) and open-source models (like BERT). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.',
 'After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter mo

### Naive RAG

In [15]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [16]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [17]:
db_naive = Chroma(persist_directory = "../papers/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [18]:
template = """Fix an error in the following query. The answer should be exactly the same as query but with corrected mistake. For example:
'input': 'There are 10 continents in the world.',
'output': 'There are 7 continents in the world.',
'input': 'The elephant is the largest animal of all time.',
'output': 'The blue whale is the largest animal of all time.'.
This is the input query: {question}. 
Here is some provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [19]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [20]:
answers_naive

['There is no error in the given query.',
 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth. \n\nThe corrected query is: The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.',
 'There is no error in the given query.',
 'After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 8B parameter models (i.e., Fi

In [21]:
result_naive_rag_context = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag_context)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.70it/s]


{'context_precision': 0.8333, 'context_recall': 1.0000}


In [22]:
result_naive_rag = calculate_accuracy(answers_naive, ground_truths)
print(result_naive_rag)

0.0


### recursive splitter

In [None]:
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [25]:
db_basic = Chroma(persist_directory = "../papers/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [26]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)
answers_recursive

['There is no error in the given query.',
 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth. \n\nThe corrected query is: The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.',
 'There is no error in the given query.',
 'After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 8B parameter models (i.e., Fi

In [27]:
result_recursive_context = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive_context)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.01it/s]


{'context_precision': 0.8000, 'context_recall': 1.0000}


In [28]:
result_recursive = calculate_accuracy(answers_recursive, ground_truths)
print(result_recursive)

0.0


### chunk sizes

In [29]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    accuracy = calculate_accuracy(answers_recursive, ground_truths)
    print(answers_recursive)
    return result, accuracy

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb/recursive_1000")
db_1000.persist()
retriever_1000 = db_1000.as_retriever()
result_1000_context = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000_context)

In [30]:
db_1000 = Chroma(persist_directory = "../papers/vectordb/recursive_1000", embedding_function=embeddings_client)

In [31]:
retriever_1000 = db_1000.as_retriever()
result_1000_context, result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000_context)
print("AVG accuracy "+ str(result_1000))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.58it/s]


['There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'There is no error in the given query.', 'The input query is already correct. No error needs to be fixed.', 'No error found in the query.', 'No error found.']
CHUNK SIZE 1000
{'context_precision': 0.8139, 'context_recall': 0.9500}
AVG accuracy 0.1


In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../vectordb/recursive_500")
db_500.persist()
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)

In [32]:
db_500 = Chroma(persist_directory = "../papers/vectordb/recursive_500", embedding_function=embeddings_client)

In [33]:
retriever_500 = db_500.as_retriever()
result_500, acc_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)
print("AVG accuracy "+ str(acc_500))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.43it/s]


['There is no error in the given query.', 'The error in the query is that it states "an average of 4.89 / 5 G-EVAL score" instead of "an average of 2.41 / 5 G-EVAL score". The corrected query is:\n\nThe results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.']
CHUNK SIZE 500
{'context_precision': 0.8417, 'context_recall': 1.0000}
AVG accuracy 0.0


In [34]:
db_2000 = Chroma(persist_directory = "../papers/vectordb/recursive_2000", embedding_function=embeddings_client)

In [35]:
retriever_2000 = db_2000.as_retriever()
result_2000, acc_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)
print("AVG accuracy "+ str(acc_2000))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.94it/s]


['There is no error in the query.', 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth. \n\nThe corrected query is: The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'There is no error in the given query.', 'After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 8B parameter models (i.e., FiD-xl) tha

In [36]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 400, chunk_overlap = 100)
chunks_400 = text_splitter.split_documents(documents)
print(len(chunks_400))
db_400 = Chroma.from_documents(chunks_400, embeddings_client, persist_directory = "../papers/vectordb/recursive_400")

907


In [37]:
retriever_400 = db_400.as_retriever()
result_400, acc_400 = run_and_evaluate(retriever_400, prompt, llm)
print("CHUNK SIZE 400")
print(result_400)
print("AVG accuracy "+ str(acc_400))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.02it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found in the query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task.', 'The query is correct and does not contain any errors.', 'There is no error in the given query.', 'No error found.']
CHUNK SIZE 400
{'context_precision': 0.9250, 'context_recall': 0.8833}
AVG accuracy 0.3


### now time to look for different top-k

Note: We continue with the size chunk of 2000 as it had the highest average score

In [38]:
retriever_2 = db_400.as_retriever(search_kwargs={"k": 2})
result_2, acc_2 = run_and_evaluate(retriever_2, prompt, llm)
print("CHUNK SIZE 1000, K=2")
print(result_2)
print("AVG accuracy "+ str(acc_2))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.18it/s]


['There is no error in the given query.', 'There is no error in the query.', 'No error found in the query.', 'There is no error in the query.', 'There is no error in the given query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', "Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task. We fine-tune a pre-trained BART model (Lewis et al. 2020a) which is, unlike GPT-2, an encoder-decoder model. In contrast to Kim et al. who use Nucleus' model that uses separate encoders, one for the dialogue context and one for knowledge documents. Relevant documents are retrieved by named entity and keyword matching. Kim, Ahn, and Kim (2020) jointly model knowledge selection and res

In [39]:
retriever_3 = db_400.as_retriever(search_kwargs={"k": 3})
result_3, acc_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 500, K=3")
print(result_3)
print("AVG accuracy "+ str(acc_3))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.95it/s]


['There is no error in the given query.', 'The error in the query is that it states "an average of 4.89 / 5 G-EVAL score" instead of "an average of 2.41 / 5 G-EVAL score". The corrected query is:\n\nThe results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in

In [40]:
retriever_5 = db_400.as_retriever(search_kwargs={"k": 5})
result_5, acc_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 500, K=5")
print(result_5)
print("AVG accuracy "+ str(acc_5))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.71it/s]


['There is no error in the given query.', 'There is no error in the given query.', 'No error found in the query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'No error found in the query.']
CHUNK SIZE 500, K=5
{'context_precision': 0.9221, 'context_recall': 0.8500}
AVG accuracy 0.1


In [41]:
retriever_6 = db_400.as_retriever(search_kwargs={"k": 6})
result_6, acc_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 500, K=6")
print(result_6)
print("AVG accuracy "+ str(acc_6))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.40it/s]


['There is no error in the given query.', 'There is no error in the given query.', 'No error found in the query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'The input query is already correct. No error found.', 'There is no error in the given query.', 'There is no error in the given query.']
CHUNK SIZE 500, K=6
{'context_precision': 0.8960, 'context_recall': 0.8550}
AVG accuracy 0.0


In [42]:
retriever_7 = db_400.as_retriever(search_kwargs={"k": 7})
result_7, acc_7 = run_and_evaluate(retriever_7, prompt, llm)
print("CHUNK SIZE 500, K=7")
print(result_7)
print("AVG accuracy "+ str(acc_7))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.17it/s]


['There is no error in the given query.', 'There is no error in the given query.', 'No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'There is no error in the given query.', 'The query is already correct. No error found.', 'No error found in the query.', 'No error found in the query.']
CHUNK SIZE 500, K=7
{'context_precision': 0.8854, 'context_recall': 1.0000}
AVG accuracy 0.1


### look for different retrievers

3 chunks was the best score

#### parent document retriever

In [43]:
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents", persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
parent_document_retriever.add_documents(documents)
result_parent, acc_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
print("AVG accuracy "+ str(acc_parent))


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.78it/s]


['There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.']
{'context_precision': 0.9000, 'context_recall': 0.9000}
AVG accuracy 0.1


In [44]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_small_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, acc_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
print("AVG accuracy "+ str(acc_parent_small))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.97it/s]


['No error found in the query.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'The query is correct. No error found.', 'There is no error in the given query.', 'No error found in the given query.']
{'context_precision': 0.9250, 'context_recall': 0.9500}
AVG accuracy 0.0


In [45]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_large", persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../papers/vectordb/parent_large_summary", embedding_function=embeddings_client)

store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large
)
parent_document_retriever_large.add_documents(documents)
result_parent_large, acc_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
print("AVG accuracy "+ str(acc_parent_large))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.74it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'The input query is already correct. No error needs to be fixed.', 'There is no error in the given query.', 'No error found.']
{'context_precision': 0.9167, 'context_recall': 0.8000}
AVG accuracy 0.1


#### Maximum marginal relevance retrieval

In [46]:
retriever_mmr = db_400.as_retriever(search_type="mmr")
result_mmr, acc_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
print("AVG accuracy "+ str(acc_mmr))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.16it/s]


['There is no error in the given query.', 'The error in the query is that it states "an average of 4.89 / 5 G-EVAL score" instead of "an average of 2.41 / 5 G-EVAL score". The corrected query is:\n\nThe results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'There is no error in the given query.', 'The query is correct and does not contain any errors.', 'There is no error in the given query.', 'No error found in the query.']
Marginal relevance
{'cont

#### BM25

In [47]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 400, chunk_overlap = 50)
chunks_400 = text_splitter.split_documents(documents)

In [48]:
retriever_bm25 = BM25Retriever.from_documents(chunks_400)
result_bm25, acc_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
print("AVG accuracy "+ str(acc_bm25))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.78it/s]


['There is no error in the given query.', 'The error in the query is that it states "an average of 4.89 / 5 G-EVAL score", whereas the correct value is "an average of 2.41 / 5 G-EVAL score". \n\nThe corrected query is: The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.', 'The HybridRAG framework is composed of four primary components: an augmentation coordinator (client), a client-side memory-augmented model (client), a retriever model (cloud), and a memory generator based on Large Language Models 

#### Ensambler - Hybrid

In [49]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_400], weights=[0.75, 0.25])
result_ensemble1, acc_retriever_1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
print("AVG accuracy "+ str(acc_retriever_1))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.81it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.', 'There is no error in the given query.', 'The input query is already correct. No error found.', 'There is no error in the given query.', 'No error found.']
Ensambler 75/25
{'context_precision': 0.7935, 'context_recall': 0.9500}
AVG accuracy 0.0


In [51]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_400], weights=[0.5, 0.5])
result_ensemble2, acc_ens_2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
print("AVG accuracy "+ str(acc_ens_2))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'No error found in the query.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.']
Ensambler 50/50
{'context_precision': 0.9252, 'context_recall': 0.9500}
AVG accuracy 0.0


In [50]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_400], weights=[0.25,0.75])
result_ensemble3, acc_ens_3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
print("AVG accuracy "+ str(acc_ens_3))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:06<00:00,  3.23it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'No error found.', 'The query is already correct. No error found.', 'No error found in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.']
Ensambler 25/75
{'context_precision': 0.9224, 'context_recall': 0.9500}
AVG accuracy 0.0


In [52]:
ensemble_retriever_4 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.75, 0.25])
result_ensemble4, acc_retriever_4 = run_and_evaluate(ensemble_retriever_4, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble4)
print("AVG accuracy "+ str(acc_retriever_4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.68it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'The query is already correct. No error found.', 'No error found in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.']
Ensambler 75/25
{'context_precision': 0.7725, 'context_recall': 0.8750}
AVG accuracy 0.0


In [53]:
ensemble_retriever_5 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.5, 0.5])
result_ensemble5, acc_retriever_5 = run_and_evaluate(ensemble_retriever_5, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble5)
print("AVG accuracy "+ str(acc_retriever_5))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'There is no error in the given query.', 'The query is already correct.', 'There is no error in the given query.', 'No error found.']
Ensambler 50/50
{'context_precision': 0.9178, 'context_recall': 0.9000}
AVG accuracy 0.0


In [54]:
ensemble_retriever_6 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.25, 0.75])
result_ensemble6, acc_retriever_6 = run_and_evaluate(ensemble_retriever_6, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble6)
print("AVG accuracy "+ str(acc_retriever_6))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:06<00:00,  3.03it/s]


['There is no error in the given query.', 'The query is already correct. No error found.', 'No error found.', 'The query does not have any errors.', 'No error found.', 'There is no error in the given query.', 'There is no error in the given query.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'No error found.']
Ensambler 25/75
{'context_precision': 0.8446, 'context_recall': 1.0000}
AVG accuracy 0.0


#### Multi-stage - reranker

In [55]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [56]:
retriever_context = retriever_400
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression, acc_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
print("AVG accuracy "+ str(acc_compression))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.55it/s]


['There is no error in the given query.', 'The query is already correct.', 'There is no error in the given query.', 'There is no error in the given query.', 'No error found.', 'This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning.', 'Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task.', 'The query is already correct. No error found.', 'There is no error in the given query.', 'No error found.']
Reranker
{'context_precision': 0.9833, 'context_recall': 0.9083}
AVG accuracy 0.2


#### creating context by remaking the query

In [57]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [58]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_400.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
acc_search_query = calculate_accuracy(answers_final, ground_truths)
print("AVG accuracy "+ str(acc_search_query))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.31it/s]


AVG accuracy 0.1


### change model to GPT-4

In [59]:
result_400_gpt4, acc_400_gpt4 = run_and_evaluate(retriever_400, prompt, llm_gpt4)
print("CHUNK SIZE 400")
print(result_400_gpt4)
print("AVG accuracy "+ str(acc_400_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.50it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.', 'After contextual documents are retrieved or generated, we employ a FiD reader 

In [60]:
result_compression_gpt4, acc_compression_gpt4 = run_and_evaluate(compression_retriever, prompt, llm_gpt4)
print("Reranker")
print(result_compression_gpt4)
print("AVG accuracy "+ str(acc_compression_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.81it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.', 'After contextual documents are retrieved or generated, we employ a FiD reader 

In [61]:
result_3_gpt4, acc_3_gpt4 = run_and_evaluate(retriever_3, prompt, llm_gpt4)
print("Ensambler 75/25")
print(result_3_gpt4)
print("AVG accuracy "+ str(acc_3_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.05it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.', 'After contextual documents are retrieved or generated, we employ a FiD reader 

In [64]:
result_500_gpt4, acc_500_gpt4 = run_and_evaluate(retriever_500, prompt, llm_gpt4)
print("CHUNK SIZE 500 gpt4")
print(result_500_gpt4)
print("AVG accuracy "+ str(acc_500_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.05it/s]


CHUNK SIZE 500 gpt4
{'context_precision': 0.8222, 'context_recall': 1.0000}
AVG accuracy 0.5


In [65]:
result_mmr_gpt4, acc_mmr_gpt4 = run_and_evaluate(retriever_mmr, prompt, llm_gpt4)
print("Marginal relevance")
print(result_mmr_gpt4)
print("AVG accuracy "+ str(acc_mmr_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:05<00:00,  3.68it/s]


["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete.", 'The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.', 'Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.', 'After contextual documents are retrieved or generated, we employ a FiD reader 

In [66]:
result_bm25_gpt4, acc_bm25_gpt4 = run_and_evaluate(retriever_bm25, prompt, llm_gpt4)
print("BM25")
print(result_bm25_gpt4)
print("AVG accuracy "+ str(acc_bm25_gpt4))

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.95it/s]


BM25
{'context_precision': 0.8722, 'context_recall': 1.0000}
AVG accuracy 0.6
