In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    context_recall,
    context_precision,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import sys
sys.tracebacklimit = 0
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
questions = [
    "In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for upload and 'D' stands for delete.",
    "The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 4.89 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth.",
    "Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as chatGPT) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models.",
    "After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 8B parameter models (i.e., FiD-xl) that are fine-tuned on the training split of target datasets.",
    "Pessimistic Rejection serves as an evaluation metric to assess the ability of Language Models (LLMs) to refuse answering a question in instances where none of the provided contexts offer pertinent information. In practical scenarios, search engines frequently encounter difficulties in retrieving documents that contain the required answers. In such situations, it becomes crucial for the model to possess the capability to decline recognition, thereby preventing the generation of inaccurate or misleading content.",
    "This metacognition space primarily encompasses three main phases: (1) Deduction; (2) Evaluating; (3) Planning.",
    "Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-4 model (Radford et al. 2019) for this task.",
    "The HybridRAG framework is composed of four primary components: an augmentation coordinator (client), a client-side memory-augmented model (client), a retriever model (on-prem), and a memory generator based on Large Language Models (LLM) (cloud).",
    "A general query is characterized by the necessity to retrieve and reason over multiple pieces of supporting evidence in order to furnish an answer. To put it differently, for a given multi-hop query denoted as q, the components within the retrieval set Rq collaboratively supply an answer to q.",
    "Examples of retrieved documents Table 3 shows an example of the REALM masked language model prediction. In this example, “Fermat” is the correct word, and REALITY (row (c)) gives the word a much high probability compared to the BERT model (row (a)). Since REALM manages to retrieve some documents with a related fact (row (b)), the marginalized probability of the correct answer dramatically increases. This shows that REALM is able to retrieve document to fill in the masked word even though it is trained with unsupervised text only."
]

ground_truths = [
    ["In CRUD, the letter 'C' stands for create, 'R' stands for read, 'U' stands for update and 'D' stands for delete."],
    ["The results of the experiment show that the version of the Forms Copilot that leverages all data sources and responses, uses chain of thought prompting, asks the model to cite the sources, and provides a description of the form has achieved an average of 2.41 / 5 G-EVAL score and 0.71 F1 BERTscore, indicating that the suggestions are similar to the ground truth."],
    ["Model Layer: This particular layer is instrumental in powering AI products delivered via either proprietary APIs or open-source checkpoints (which necessitate hosting solutions). Within the Foundation Models category, there exist both proprietary models from private sources (such as GPT-4) and open-source models (like Stable Diffusion). Additionally, there are model hubs dedicated to the sharing and hosting of Foundation Models."],
    ["After contextual documents are retrieved or generated, we employ a FiD reader with 770M parameter models (i.e., FiD-l) and 3B parameter models (i.e., FiD-xl) that are fine-tuned on the training split of target datasets."],
    ["Negative Rejection serves as an evaluation metric to assess the ability of Language Models (LLMs) to refuse answering a question in instances where none of the provided contexts offer pertinent information. In practical scenarios, search engines frequently encounter difficulties in retrieving documents that contain the required answers. In such situations, it becomes crucial for the model to possess the capability to decline recognition, thereby preventing the generation of inaccurate or misleading content."],
    ["This metacognition space primarily encompasses three main phases: (1) Monitoring; (2) Evaluating; (3) Planning."],
    ["Response Generation Finally, for response generation, Kim et al. suggest to use an autoregressive sequence generation model conditioned on the dialog context and the knowledge snippet selected in the previous subtask. Kim et al. propose to fine-tune a pre-trained GPT-2 model (Radford et al. 2019) for this task."],
    ["The HybridRAG framework is composed of four primary components: an augmentation coordinator (client), a client-side memory-augmented model (client), a retriever model (cloud), and a memory generator based on Large Language Models (LLM) (cloud)."],
    ["A multi-hop query is characterized by the necessity to retrieve and reason over multiple pieces of supporting evidence in order to furnish an answer. To put it differently, for a given multi-hop query denoted as q, the components within the retrieval set Rq collaboratively supply an answer to q."],
    ["Examples of retrieved documents Table 3 shows an example of the REALM masked language model prediction. In this example, “Fermat” is the correct word, and REALM (row (c)) gives the word a much high probability compared to the BERT model (row (a)). Since REALM manages to retrieve some documents with a related fact (row (b)), the marginalized probability of the correct answer dramatically increases. This shows that REALM is able to retrieve document to fill in the masked word even though it is trained with unsupervised text only."]
]


In [4]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            context_precision,
            context_recall,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
def flatten_list(lst):
    return [item.replace('"', '').replace("'", '').replace('“','').replace('”', '') for sublist in lst for item in (sublist if isinstance(sublist, list) else [sublist])]

def calculate_accuracy(answers, ground_truths):
    # Convert both answers and ground truths to lowercase for case-insensitive comparison
    answers_lower = flatten_list([answer.lower() for answer in answers])
    ground_truths_lower = flatten_list([[truth.lower() for truth in sublist] for sublist in ground_truths])
    # Check if each answer corresponds with the ground truth exactly
    accuracy = [1 if ans == truth else 0 for ans, truth in zip(answers_lower, ground_truths_lower)]
    # Calculate the average accuracy
    average_accuracy = sum(accuracy) / len(accuracy)
    return average_accuracy

In [7]:
columns = ["System", "Context Precision", "Context Recall", "Accuracy"]
results_df = pd.DataFrame(columns=columns)

In [8]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    accuracy = calculate_accuracy(answers, ground_truths)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Accuracy": accuracy
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [9]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    accuracy = calculate_accuracy(answers, ground_truths)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Accuracy": accuracy
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answers by LLMs

In [10]:
template = """ Fix an error in the following query. The answer should be exactly the same as query but with corrected mistake. For example:
'input': 'There are 10 continents in the world.',
'output': 'There are 7 continents in the world.',
'input': 'The elephant is the largest animal of all time.',
'output': 'The blue whale is the largest animal of all time.'.
This is the input query: {question}"""
prompt = ChatPromptTemplate.from_template(template)

In [11]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [12]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [13]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [14]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

    System  Context Precision  Context Recall  Accuracy
0  GPT-3.5                NaN             NaN       0.0


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [15]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

  System  Context Precision  Context Recall  Accuracy
0  GPT-4                NaN             NaN       0.3


### Naive RAG

In [16]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [17]:
loader = DirectoryLoader('../news', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [18]:
# text_splitter = CharacterTextSplitter()
# chunks = text_splitter.split_documents(documents)
# db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb/naive")
# db_naive.persist()
# retriever_naive = db_naive.as_retriever()

In [19]:
db_naive = Chroma(persist_directory = "../papers/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [20]:
template = """Fix an error in the input query. The answer should be exactly the same as query but with corrected mistake. For example:
'input': 'There are 10 continents in the world.',
'output': 'There are 7 continents in the world.',
'input': 'The elephant is the largest animal of all time.',
'output': 'The blue whale is the largest animal of all time.'.
This is the input query: {question}. 
Here is some provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [21]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [22]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.06it/s]


  System  Context Precision  Context Recall  Accuracy
0  Naive              0.825             1.0       0.1


### recursive splitter

In [23]:
# text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
# chunks_r = text_splitter.split_documents(documents)
# db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb/recursive_basic")
# db_basic.persist()
# retriever_basic = db_basic.as_retriever()

In [24]:
db_basic = Chroma(persist_directory = "../papers/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [25]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [26]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.66it/s]


Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,Recursive,0.816667,1.0,0.1


### chunk sizes

In [27]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [28]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../papers/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory = f"../papers/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}", embedding_function=embeddings_client)
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.23it/s]


CHUNK SIZE 500, 0% overlap:
                  System  Context Precision  Context Recall  Accuracy
0  Chunk 500, overlap 0%                0.8             1.0       0.2
Chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.96it/s]


CHUNK SIZE 500, 5% overlap:
                  System  Context Precision  Context Recall  Accuracy
0  Chunk 500, overlap 5%           0.808333             1.0       0.2
Chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.03it/s]


CHUNK SIZE 500, 10% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 500, overlap 10%           0.816667             0.9       0.4
Chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.23it/s]


CHUNK SIZE 500, 15% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 500, overlap 15%           0.872222             0.9       0.4
Chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.45it/s]


CHUNK SIZE 500, 20% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 500, overlap 20%           0.844444        0.966667       0.3
Chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.10it/s]


CHUNK SIZE 1000, 0% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 1000, overlap 0%           0.880556             1.0       0.2
Chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.62it/s]


CHUNK SIZE 1000, 5% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 1000, overlap 5%           0.944444            0.95       0.2
Chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.03it/s]


CHUNK SIZE 1000, 10% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 1000, overlap 10%           0.780556            0.95       0.2
Chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.32it/s]


CHUNK SIZE 1000, 15% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 1000, overlap 15%           0.916667             0.8       0.3
Chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.70it/s]


CHUNK SIZE 1000, 20% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 1000, overlap 20%           0.830556             0.9       0.3
Chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.82it/s]


CHUNK SIZE 2000, 0% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 2000, overlap 0%           0.780556             0.9       0.1
Chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.16it/s]


CHUNK SIZE 2000, 5% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 2000, overlap 5%           0.797222             0.9       0.1
Chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.82it/s]


CHUNK SIZE 2000, 10% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 2000, overlap 10%           0.772222             0.9       0.1
Chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.32it/s]


CHUNK SIZE 2000, 15% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 2000, overlap 15%           0.772222             0.9       0.1
Chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.76it/s]


CHUNK SIZE 2000, 20% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 2000, overlap 20%           0.847222             0.9       0.1
Chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.98it/s]


CHUNK SIZE 3000, 0% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 3000, overlap 0%           0.847222             0.9       0.1
Chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.81it/s]


CHUNK SIZE 3000, 5% overlap:
                   System  Context Precision  Context Recall  Accuracy
0  Chunk 3000, overlap 5%           0.830556             0.9       0.1
Chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.70it/s]


CHUNK SIZE 3000, 10% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 3000, overlap 10%           0.797222             0.9       0.1
Chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.63it/s]


CHUNK SIZE 3000, 15% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 3000, overlap 15%              0.825             0.9       0.1
Chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.82it/s]


CHUNK SIZE 3000, 20% overlap:
                    System  Context Precision  Context Recall  Accuracy
0  Chunk 3000, overlap 20%           0.880556             0.9       0.1


In [None]:
results_df.to_csv(f"../papers/results/results_errors.csv", index=False)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb/recursive_1000")
# db_1000.persist()
# retriever_1000 = db_1000.as_retriever()
# result_1000_context = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000_context)

In [None]:
# db_1000 = Chroma(persist_directory = "../papers/vectordb/recursive_1000", embedding_function=embeddings_client)

In [None]:
# retriever_1000 = db_1000.as_retriever()
# result_1000_context, result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000_context)
# print("AVG accuracy "+ str(result_1000))

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../vectordb/recursive_500")
# db_500.persist()
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)

In [None]:
# db_500 = Chroma(persist_directory = "../papers/vectordb/recursive_500", embedding_function=embeddings_client)

In [None]:
# retriever_500 = db_500.as_retriever()
# result_500, acc_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)
# print("AVG accuracy "+ str(acc_500))

In [None]:
# db_2000 = Chroma(persist_directory = "../papers/vectordb/recursive_2000", embedding_function=embeddings_client)

In [None]:
# retriever_2000 = db_2000.as_retriever()
# result_2000, acc_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)
# print("AVG accuracy "+ str(acc_2000))

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 400, chunk_overlap = 100)
# chunks_400 = text_splitter.split_documents(documents)
# print(len(chunks_400))
# db_400 = Chroma.from_documents(chunks_400, embeddings_client, persist_directory = "../papers/vectordb/recursive_400")

In [None]:
# retriever_400 = db_400.as_retriever()
# result_400, acc_400 = run_and_evaluate(retriever_400, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_400)
# print("AVG accuracy "+ str(acc_400))

### now time to look for different top-k

Note: We continue with the size chunk of 500, 5% as it had the highest average score

In [29]:
highest = results_df.nlargest(1, "Accuracy")
highest

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4


In [30]:
db_k = Chroma(persist_directory = "../papers/vectordb-edit/chunking_500_10", embedding_function=embeddings_client)

In [31]:
k_values = [2, 3, 5]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size 500, overlap 10%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.99it/s]


Results for K=2:
                             System  Context Precision  Context Recall  \
0  Chunk size 500, overlap 10%, K=2               0.85             1.0   

   Accuracy  
0       0.3  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.63it/s]


Results for K=3:
                             System  Context Precision  Context Recall  \
0  Chunk size 500, overlap 10%, K=3           0.841667             0.9   

   Accuracy  
0       0.3  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.73it/s]


Results for K=5:
                             System  Context Precision  Context Recall  \
0  Chunk size 500, overlap 10%, K=5           0.819444             0.9   

   Accuracy  
0       0.3  


In [None]:
# retriever_2 = db_500.as_retriever(search_kwargs={"k": 2})
# result_2, acc_2 = run_and_evaluate(retriever_2, prompt, llm)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)
# print("AVG accuracy "+ str(acc_2))

In [None]:
# retriever_3 = db_500.as_retriever(search_kwargs={"k": 3})
# result_3, acc_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 500, K=3")
# print(result_3)
# print("AVG accuracy "+ str(acc_3))

In [None]:
# retriever_5 = db_500.as_retriever(search_kwargs={"k": 5})
# result_5, acc_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 500, K=5")
# print(result_5)
# print("AVG accuracy "+ str(acc_5))

In [None]:
# retriever_6 = db_500.as_retriever(search_kwargs={"k": 6})
# result_6, acc_6 = run_and_evaluate(retriever_6, prompt, llm)
# print("CHUNK SIZE 500, K=6")
# print(result_6)
# print("AVG accuracy "+ str(acc_6))

### look for different retrievers



#### parent document retriever

In [32]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../papers/vectordb-edit/parent_error", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.42it/s]


                      System  Context Precision  Context Recall  Accuracy
0  Parent Retriever 1000-200           0.083333             0.4       0.1


In [33]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb-edit/parent_small_error", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.35it/s]


                     System  Context Precision  Context Recall  Accuracy
0  Parent Retriever 500-100               0.15        0.133333       0.0


In [34]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../papers/vectordb-edit/large_error", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_large, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  55%|█████▌    | 11/20 [00:01<00:01,  7.09it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

                      System  Context Precision  Context Recall  Accuracy
0  Parent Retriever 1500-200           0.092593        0.266667       0.1


#### Maximum marginal relevance retrieval

In [35]:
highest = results_df.nlargest(1, "Accuracy")
highest

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4


In [36]:
retriever_mmr = db_k.as_retriever(search_type="mmr")
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.82it/s]


Marginal relevance
  System  Context Precision  Context Recall  Accuracy
0    MMR           0.791667        0.833333       0.2


#### BM25

In [37]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)

In [38]:
retriever_bm25 = BM25Retriever.from_documents(chunks_500)
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 4/20 [00:01<00:03,  4.64it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'sever

BM25
  System  Context Precision  Context Recall  Accuracy
0   BM25           0.055556             0.3       0.0


#### Ensambler - Hybrid

In [40]:
ret = db_k.as_retriever()

In [41]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   5%|▌         | 1/20 [00:01<00:21,  1.13s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'sever

Ensambler
        System  Context Precision  Context Recall  Accuracy
0  Ensambler 1           0.288845             0.9       0.1


In [42]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  40%|████      | 8/20 [00:01<00:01,  7.46it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'sever

Ensambler
        System  Context Precision  Context Recall  Accuracy
0  Ensambler 2           0.566027             0.8       0.0


In [43]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  50%|█████     | 10/20 [00:02<00:01,  5.44it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

Ensambler
        System  Context Precision  Context Recall  Accuracy
0  Ensambler 3            0.82037             0.7       0.2


#### Multi-stage - reranker

In [None]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [None]:
retriever_context = retriever_6
compressor = CohereRerank(top_n = 3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression, acc_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
print("AVG accuracy "+ str(acc_compression))

#### creating context by remaking the query

In [44]:
highest = results_df.nlargest(1, "Accuracy")
highest

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4


In [45]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [46]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = ret.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.97it/s]


{'context_precision': 0.8028, 'context_recall': 0.9000}

In [47]:
accuracy = calculate_accuracy(answers_final, ground_truths)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Accuracy": accuracy
    }
df_result_search_query = pd.DataFrame([system_results])

In [48]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,GPT-3.5,,,0.0
1,GPT-4,,,0.3
2,Naive,0.825,1.0,0.1
3,Recursive,0.816667,1.0,0.1
4,"Chunk 500, overlap 0%",0.8,1.0,0.2
5,"Chunk 500, overlap 5%",0.808333,1.0,0.2
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4
7,"Chunk 500, overlap 15%",0.872222,0.9,0.4
8,"Chunk 500, overlap 20%",0.844444,0.966667,0.3
9,"Chunk 1000, overlap 0%",0.880556,1.0,0.2


### change model to GPT-4

In [50]:
top_3 = results_df.nlargest(4, "Accuracy")
top_3

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4
7,"Chunk 500, overlap 15%",0.872222,0.9,0.4
1,GPT-4,,,0.3
8,"Chunk 500, overlap 20%",0.844444,0.966667,0.3


In [51]:
result_ret_gpt4, results_df = run_and_evaluate(f"Chunk 500, overlap 10%, GPT-4", ret, prompt, llm_gpt4, results_df)
result_ret_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  7.58it/s]


Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,"Chunk 500, overlap 5%, GPT-4",0.816667,0.9,0.4


In [52]:
db_k_extra = Chroma(persist_directory = "../papers/vectordb-edit/chunking_500_15", embedding_function=embeddings_client)
retriever_15 = db_k_extra.as_retriever()
result_retriever_15_gpt4, results_df = run_and_evaluate(f"Chunk size 500, overlap 15%, K=3, GPT-4", retriever_15, prompt, llm_gpt4, results_df)
result_retriever_15_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:03<00:00,  6.65it/s]


Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,"Chunk size 500, overlap 15%, K=3, GPT-4",0.897222,1.0,0.2


In [53]:
db_k_extra_20 = Chroma(persist_directory = "../papers/vectordb-edit/chunking_500_20", embedding_function=embeddings_client)
retriever_20 = db_k_extra_20.as_retriever()
result_retriever_20_gpt4, results_df = run_and_evaluate(f"Chunk size 500, overlap 20%, K=3, GPT-4", retriever_20, prompt, llm_gpt4, results_df)
result_retriever_20_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  6.92it/s]


Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,"Chunk size 500, overlap 20%, K=3, GPT-4",0.838889,0.966667,0.4


In [54]:
replacement_map = {
    'Chunk 500, overlap 5%, GPT-4': 'Chunk 500, overlap 10%, GPT-4'
}
results_df['System'] = results_df['System'].replace(replacement_map)

In [55]:
results_df

Unnamed: 0,System,Context Precision,Context Recall,Accuracy
0,GPT-3.5,,,0.0
1,GPT-4,,,0.3
2,Naive,0.825,1.0,0.1
3,Recursive,0.816667,1.0,0.1
4,"Chunk 500, overlap 0%",0.8,1.0,0.2
5,"Chunk 500, overlap 5%",0.808333,1.0,0.2
6,"Chunk 500, overlap 10%",0.816667,0.9,0.4
7,"Chunk 500, overlap 15%",0.872222,0.9,0.4
8,"Chunk 500, overlap 20%",0.844444,0.966667,0.3
9,"Chunk 1000, overlap 0%",0.880556,1.0,0.2


In [56]:
results_df.to_csv(f"../papers/results/results_errors.csv", index=False)