In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import pandas as pd
import numpy as np
import sys
sys.tracebacklimit = 0
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [6]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [7]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [8]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [9]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [10]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [11]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

In [12]:
questions = [
    "What is the paper by Chen et al. about?",
    "What is the main focus and contribution of the paper by Cheonsu Jeong regarding generative AI technology, specifically in addressing information scarcity, and what is the significance of the proposed Retrieval-Augmented Generation (RAG) model in improving content generation within the Large Language Models (LLM) application architecture?",
    "What are the conclusions of CRUD-RAG: A Comprehensive Chinese Benchmark for Retrieval-Augmented Generation of Large Language Models?",
    "How does the proposed approach in the 'Efficient Retrieval Augmented Generation from Unstructured Knowledge for Task-Oriented Dialog' address the computational complexity issue in knowledge selection and what methods are employed in response generation to consider multiple knowledge snippets?",
    "What challenges and potential applications are discussed for large language models like GPT in the context of automated form completion and knowledge-intensive tasks, and what innovative methods are proposed to address these challenges in each scenario?",
    "Explain Metacognitive RAG",
    "What is the HybridRAG framework and its main components, and how does it aim to enhance the performance of a client-based language model?",
    "What are the reasons for using GPT-4 over GPT-3.5?",
    "What are the key challenges in the implementation of RAG systems, and how do the authors of 'THE CHRONICLES OF RAG: THE RETRIEVER, THE CHUNK AND THE GENERATOR' propose to address them, particularly focusing on retriever quality, input size optimization, and overall performance evaluation?",
    "Why is RAG preferred over general LLMs?"
]

ground_truths = [
    ["The research paper 'Benchmarking Large Language Models in Retrieval-Augmented Generation' evaluates the abilities of Large Language Models (LLMs) in utilizing retrieved information for generation tasks. It introduces the Retrieval-Augmented Generation Benchmark (RGB) to assess LLMs' capabilities in noisy environments, negative rejection scenarios, information integration, and counterfactual robustness. The paper aims to comprehensively evaluate LLMs' performance in effectively utilizing external information and overcoming challenges in information retrieval."],
    ["The research paper focuses on implementing generative AI services using the Large Language Models (LLM) application architecture. The main contributions of the paper include addressing information scarcity within LLM by proposing specific remedies using fine-tuning techniques and direct document integration. The significant contribution is the development of the Retrieval-Augmented Generation (RAG) model, which enhances information storage and retrieval processes, leading to improved content generation within enterprises utilizing LLMs. The RAG model combines information retrieval with text generation, improving the accuracy and reliability of generated content, specifically addressing challenges related to data insufficiency within LLMs."],
    ["The paper introduces a comprehensive framework, CRUD-RAG, for evaluating retrieval-augmented generation (RAG) systems in text generation tasks. The framework categorizes tasks into CRUD types (Create, Read, Update, Delete), providing a structured assessment of RAG capabilities. Large-scale datasets for each CRUD category are created to evaluate RAG performance under various conditions. Experimental comparisons demonstrate that RAG systems significantly improve content quality by incorporating information from external sources. The study emphasizes the importance of fine-tuning RAG systems, addressing factors such as retrieval model optimization, context length, knowledge base construction, and large language model deployment. The insights aim to guide researchers and practitioners, offering a roadmap for the development and refinement of RAG systems. The paper envisions stimulating further exploration and innovation in RAG technologies to advance text generation applications and enhance the integration of retrieval mechanisms and language models for more intelligent and context-aware generative systems."],
    ["The proposed approach addresses the computational complexity issue in knowledge selection by introducing the Hierarchical Selection method. This method facilitates the reduction of computational cost by splitting the task of classifying a relevant document into subsequent stages. By first identifying the relevant domain and entity before selecting the relevant snippet among the documents of that entity, the approach is able to significantly reduce the computational cost. In response generation, the approach employs Retrieval Augmented Generation (RAG) to consider multiple knowledge snippets. By using RAG, the model is able to generate responses based on multiple selected snippets, allowing for a more comprehensive and informed generation process. This method incorporates the knowledge from multiple sources to enhance the response generation process. Additionally, the knowledge selection process is optimized through the use of metadata, and two variants are presented: one involving three separate models for domain, entity, and document retrieval, and the other involving joint retrieval of domain and entity. This reduces the computational complexity from O(|K|) to O(|D| + |E|) and O(|E| + |K|) for the first and second variants, respectively. The response generation task is formulated as a marginalization over selected knowledge snippets, enabling the consideration of multiple snippets in generating responses and addressing potential errors introduced by a hard decision in the selection step. The method involves a probability distribution over knowledge snippets given a dialog context and includes both selection and generation probabilities."],
    ["The challenges and potential applications discussed for large language models like GPT include their remarkable natural language processing capabilities in tasks such as automated form completion and knowledge-intensive tasks like open-domain question answering (QA), fact checking, and dialogue systems. In the context of automated form completion, challenges include understanding form layout, guidelines, user intent, and reasoning over data. The proposed method involves creating a knowledge base, augmenting the language model with the knowledge base using retrieval-augmented generation, and using prompt engineering techniques. In knowledge-intensive tasks, the text introduces a generate-then-read (GENREAD) method, replacing document retrievers with large language model generators. Additionally, the Hybrid Retrieval-Augmented Generation (HybridRAG) framework is proposed to efficiently combine a cloud-based LLM with a smaller client-side language model. Another aspect is the introduction of MetaRAG, which combines retrieval-augmented generation with metacognition to enhance introspective reasoning abilities and improve response strategies through a three-step metacognitive regulation pipeline."],
    ["Metacognitive Retrieval-Augmented Generation (MetaRAG) is a framework for enhancing retrieval-augmented generation processes in large language models through the integration of metacognition. MetaRAG combines the capabilities of retrieval systems with metacognitive abilities inspired by human cognitive processes. It allows the model to monitor, evaluate, and plan its response strategies, enabling introspective reasoning and self-awareness in the cognitive process. This approach aims to identify and rectify errors or shortcomings in the model's reasoning, ultimately leading to more accurate and precise answer derivation."],
    ["The HybridRAG framework comprises an augmentation coordinator (client), a memory-augmented client model (client), a retriever model (cloud), and a language model (LLM)-based memory generator (cloud). The augmentation coordinator monitors writing context and requests augmented memory from the cloud based on context changes. The retriever model searches for relevant data in a retrieval corpus using Dense Passage Retrieval. The LLM-based memory generator generates concise key takeaways from retrieved documents, optimizing memory size. The memory is integrated into the client model to enhance its overall performance. The client model is fine-tuned using instruction-based prompts and LLM-generated references to effectively leverage cloud-generated memory."],
    ["Enhanced Performance: GPT-4 surpasses GPT-3.5 in various tasks, particularly excelling in question-answering scenarios that demand reasoning across multiple documents. Advanced Knowledge Integration: GPT-4 exhibits remarkable proficiency in fusing knowledge, especially when a comprehensive understanding of information from diverse documents is imperative. This underscores its superior capability in handling intricate tasks. Efficient Text Refinement: GPT-4 effectively refines and corrects text in scenarios requiring modification, maintaining accuracy and content relevance while addressing errors or hallucinations in the text. Concise Summary Production: GPT-4 generates concise summaries by eliminating redundant information, showcasing its efficiency in creating informative and succinct content. Robust Performance: While GPT-4 stands out as the preferred choice due to its overall outstanding performance, models like Qwen-7B and Qwen-14B also demonstrate competitive results in tasks such as text continuation, summary generation, and question answering."],
    ["The challenges in RAG system implementation include effective integration of retrieval models, representation learning efficiency, diverse data handling, computational efficiency optimization, and quality evaluation. The article proposes best practices for implementing RAG on a Brazilian Portuguese dataset, emphasizing a simplified pipeline. Key findings include a 35.4% improvement in MRR@10 with retriever quality enhancement, a 2.4% performance boost through input size optimization, and a final accuracy of 98.61%, marking a 40.73% improvement compared to the baseline. The study underscores the importance of data quality in input, retriever, and evaluation, providing insights into formulating queries, understanding information retrieval behavior, and defining evaluation metrics. The future work involves exploring additional datasets for segmentation and chunk construction techniques."],
    ["RAG (Retrieval Augmented Generation) is preferred over regular Large Language Models (LLMs) due to its ability to access external data, address training limitations, and mitigate hallucinations by integrating retrieved information. It leverages an effective retriever pipeline for coherent responses, enhancing text generation quality with relevant external information. RAG employs external knowledge sources, frameworks like LlamaIndex and LangChain, and metacognitive abilities to improve response credibility, accuracy, and reasoning over multiple evidence pieces. It offers benefits such as enhanced utility, low latency through HybridRAG, reduced client-to-cloud communication, and efficient memory integration. In form filling, RAG accesses additional information for accurate responses, correcting errors, and conditioning response generation on selected knowledge snippets. It mitigates issues in LLMs, such as factual errors and outdated knowledge, providing a promising solution by integrating external knowledge for more reliable and coherent responses."]
]

### General answer by LLMs

In [13]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [14]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [16]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.22it/s]


The average score is: 0.7011075175058377
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5           1.0          0.743175                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.915695            0.487775  0.701108  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.98it/s]


The average score is: 0.6902620942552353
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4      0.971429          0.723279                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.902019            0.484846  0.690262  


# RAG

### Naive RAG

In [14]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [47]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [15]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [21]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = CharacterTextSplitter()
# chunks = text_splitter.split_documents(documents)
# db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb-edit/naive")
# db_naive.persist()
# retriever_naive = db_naive.as_retriever()

In [22]:
db_naive = Chroma(persist_directory = "../papers/vectordb-edit/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [23]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [24]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  30%|███       | 18/60 [00:05<00:09,  4.44it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.23it/s]


The average score is: 0.8784985152727972
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.686914          0.964286           0.874123        0.905556   

   Answer Similarity  Answer Correctness   Average  
0           0.930638            0.909474  0.878499  


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
# chunks_r = text_splitter.split_documents(documents)
# db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_basic")
# db_basic.persist()
# retriever_basic = db_basic.as_retriever()

In [25]:
db_basic = Chroma(persist_directory = "../papers/vectordb-edit/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [26]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [27]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:05<00:18,  2.61it/s]Task exception was never retrieved
future: <Task finished name='Task-253' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-254' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Evaluating:  20%|██        | 12/60 [00:05<00:17,  2.79it/s]Invalid JSON response. Expected dictionary wit

The average score is: 0.8775262827504023


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.669849,0.964286,0.885355,0.905556,0.930638,0.909474,0.877526


### chunk sizes

In [16]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [29]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../papers/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory = f"../papers/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}", embedding_function=embeddings_client)
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:04<00:16,  2.87it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.22it/s]


The average score is: 0.8478294359303452
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.953704          0.806519           0.805556   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9            0.93744            0.683758  0.847829  
Number of chunks for chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  72%|███████▏  | 43/60 [00:05<00:01, 15.55it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.65it/s]


The average score is: 0.8932811010205719
This is the new best value!
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.944444          0.933492           0.891667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.947441            0.742642  0.893281  
Number of chunks for chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:04<00:16,  2.75it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.89it/s]


The average score is: 0.8524234424988394
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%      0.705793          0.974965            0.80458   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.872222           0.875755            0.881225  0.852423  
Number of chunks for chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-490' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-491' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-492' coro=<AsyncClient.aclose() done, defined

The average score is: 0.8761099363291128
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.752637          0.907425           0.939409   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0            0.85            0.98129            0.825899  0.87611  
Number of chunks for chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:04<00:16,  2.74it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  27%|██▋       | 16/60 [00:05<00:13,  3.16it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.94it/s]


The average score is: 0.8752834041915757
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.984127          0.931073                0.8   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.875           0.938601            0.722899  0.875283  
Number of chunks for chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.14it/s]


The average score is: 0.855402338615498
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%           1.0          0.811036           0.858333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0          0.8675           0.927681            0.667863  0.855402  
Number of chunks for chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.62it/s]


The average score is: 0.854468635603447
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%        0.8775           0.89488           0.855556   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.848889           0.948182            0.701805  0.854469  
Number of chunks for chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.70it/s]


The average score is: 0.8163619543706294
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%         0.875          0.724113           0.836111   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.863333           0.927738            0.671876  0.816362  
Number of chunks for chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:05<00:17,  2.68it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.06it/s]


The average score is: 0.8387833135494179
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%      0.687556           0.85145           0.853393   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.825            0.94969            0.865611  0.838783  
Number of chunks for chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.51it/s]


The average score is: 0.8426677663390895
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%      0.920635          0.797818           0.822222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.936076            0.679255  0.842668  
Number of chunks for chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  33%|███▎      | 20/60 [00:05<00:06,  5.74it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.19it/s]


The average score is: 0.8699671378021915
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.679827          0.942857            0.88572   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.922222           0.870638            0.918538  0.869967  
Number of chunks for chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.34it/s]


The average score is: 0.8794751429774427
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%           1.0          0.880269           0.897222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.835           0.937845            0.726514  0.879475  
Number of chunks for chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  35%|███▌      | 21/60 [00:05<00:07,  5.16it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.22it/s]


The average score is: 0.8970455466967152
This is the new best value!
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%      0.716439               1.0           0.877715   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.938889           0.930638            0.918592  0.897046  
Number of chunks for chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  28%|██▊       | 17/60 [00:05<00:09,  4.32it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.89it/s]


The average score is: 0.8802579176463924
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.701334               1.0           0.891013   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.880556           0.890638            0.918007  0.880258  
Number of chunks for chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:04<00:18,  2.59it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  25%|██▌       | 15/60 [00:04<00:11,  3.97it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.14it/s]


The average score is: 0.8739585054178122
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.815049          0.826622           0.906254   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.928893            0.85286            0.914072  0.873959  
Number of chunks for chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:05<00:15,  3.12it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.31it/s]


The average score is: 0.8644635704772573
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%       0.69633               1.0           0.796392   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.922222           0.855638            0.916199  0.864464  
Number of chunks for chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:05<00:20,  2.40it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.58it/s]


The average score is: 0.8405938442566869
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.697668          0.982143           0.798822   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.822222           0.830638             0.91207  0.840594  
Number of chunks for chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  33%|███▎      | 20/60 [00:05<00:07,  5.24it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.31it/s]


The average score is: 0.8963464067484345
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.712127               1.0            0.86971   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.947222           0.930638            0.918381  0.896346  
Number of chunks for chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1421' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1422' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1423' coro=<AsyncClient.aclose() done, defi

The average score is: 0.8572789644008911
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%      0.690314               1.0           0.804194   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.905556           0.830638            0.912973  0.857279  
Number of chunks for chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  33%|███▎      | 20/60 [00:05<00:08,  4.95it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.64it/s]


The average score is: 0.8377334742206918
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.671085               1.0            0.79439   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.797222           0.850638            0.913066  0.837733  


In [17]:
results_df = pd.read_csv(f"../papers/results/results_summarize.csv")

chunk = 1000

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_1000")
# db_1000.persist()

In [None]:
# db_1000 = Chroma(persist_directory = "../papers/vectordb-edit/recursive_1000", embedding_function=embeddings_client)

In [None]:
# retriever_1000 = db_1000.as_retriever()
# result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000)

In [None]:
# dict_result_1000 = dictionary(result_1000)

chunk = 500

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_500")
# db_500.persist()

In [None]:
# db_500 = Chroma(persist_directory = "../papers/vectordb-edit/recursive_500", embedding_function=embeddings_client)

In [None]:
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)

In [None]:
# dict_result_500 = dictionary(result_500)

chunk = 2000

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_2000")


In [None]:
# db_2000 = Chroma(persist_directory = "../papers/vectordb-edit/recursive_2000", embedding_function=embeddings_client)

In [None]:
# retriever_2000 = db_2000.as_retriever()
# result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)

In [None]:
# dict_result_2000 = dictionary(result_2000)

chunk = 3000

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_3000")

In [None]:
# db_3000 = Chroma(persist_directory = "../papers/vectordb-edit/recursive_3000", embedding_function=embeddings_client)

In [None]:
# retriever_3000 = db_3000.as_retriever()
# result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
# print("CHUNK SIZE 3000")
# print(result_3000)

In [None]:
# dict_result_3000 = dictionary(result_3000)

In [30]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
16,"Chunk 2000, overlap 10%",0.716439,1.0,0.877715,0.938889,0.930638,0.918592,0.897046


In [32]:
results_df.to_csv(f"../papers/results/results_summarize.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [24]:
db_k = Chroma(persist_directory = "../papers/vectordb-edit/chunking_2000_10", embedding_function=embeddings_client)

In [34]:
k_values = [2, 3, 5, 6, 7]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size 2000, overlap 10%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:02<02:17,  2.33s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.40it/s]


The average score is: 0.882500889118932
Results for K=2:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 2000, overlap 10%, K=2      0.937241          0.823195   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.973148            0.92           0.935414            0.706006   

    Average  
0  0.882501  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.97it/s]


The average score is: 0.9186200201759759
This is the new best value!
Results for K=3:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 2000, overlap 10%, K=3      0.946429           0.91532   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.966667             1.0           0.943323            0.739982   

   Average  
0  0.91862  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.42it/s]


The average score is: 0.9114472045250462
Results for K=5:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 2000, overlap 10%, K=5           1.0          0.849876   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.902917             1.0            0.94531            0.770581   

    Average  
0  0.911447  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  17%|█▋        | 10/60 [00:02<00:09,  5.42it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8268 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  25%|██▌       | 15/60 [00:04<00:13,  3.32it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  97%|█████████▋| 58/60 [00:12<00:00,  2.45it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resu

The average score is: 0.8800784633872859
Results for K=6:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 2000, overlap 10%, K=6      0.857163          0.901688   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.923695        0.910616           0.878918             0.80839   

    Average  
0  0.880078  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8416 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   2%|▏         | 1/60 [00:00<00:13,  4.29it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8656 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

The average score is: 0.8673777310114494
Results for K=7:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 2000, overlap 10%, K=7      0.981916          0.833274   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.852652            0.95           0.905625            0.680799   

    Average  
0  0.867378  


In [None]:
# retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
# result_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)

In [None]:
# dict_result_3 = dictionary(result_3)

In [None]:
# retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
# result_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)

In [None]:
# dict_result_5 = dictionary(result_5)

In [None]:
# retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})
# result_6 = run_and_evaluate(retriever_6, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_6)

In [None]:
# dict_result_6 = dictionary(result_6)

In [None]:
# retriever_7= db_1000.as_retriever(search_kwargs={"k": 7})
# result_7 = run_and_evaluate(retriever_7, prompt, llm)
# print("CHUNK SIZE 1000, K=7")
# print(result_7)
# dict_result_7 = dictionary(result_7)

In [None]:
results_df

### look for different retrievers


#### parent document retriever

In [48]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../papers/vectordb-edit/summary-parent", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:04<00:10,  4.05it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.98it/s]


The average score is: 0.8408850037495385
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.693712           0.90409   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.764877             0.9           0.955329            0.827303   

    Average  
0  0.840885  


In [49]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb-edit/summary-parent_small", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:31<00:00,  1.91it/s]


The average score is: 0.8675080541266112
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.962963          0.802671   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                0.9             0.9           0.934762            0.704652   

    Average  
0  0.867508  


In [None]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../papers/vectordb-edit/summary-parent_large", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

In [22]:
# results_df.drop(30, inplace = True)
# results_df.drop(31, inplace = True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.743175,,,0.915695,0.487775,0.701108
1,GPT-4,0.971429,0.723279,,,0.902019,0.484846,0.690262
2,Naive,0.686914,0.964286,0.874123,0.905556,0.930638,0.909474,0.878499
3,Recursive,0.669849,0.964286,0.885355,0.905556,0.930638,0.909474,0.877526
4,"Chunk 500, overlap 0%",0.953704,0.806519,0.805556,0.9,0.93744,0.683758,0.847829
5,"Chunk 500, overlap 5%",0.944444,0.933492,0.891667,0.9,0.947441,0.742642,0.893281
6,"Chunk 500, overlap 10%",0.705793,0.974965,0.80458,0.872222,0.875755,0.881225,0.852423
7,"Chunk 500, overlap 15%",0.752637,0.907425,0.939409,0.85,0.98129,0.825899,0.87611
8,"Chunk 500, overlap 20%",0.984127,0.931073,0.8,0.875,0.938601,0.722899,0.875283
9,"Chunk 1000, overlap 0%",1.0,0.811036,0.858333,0.8675,0.927681,0.667863,0.855402


In [23]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
25,"Chunk size 2000, overlap 10%, K=3",0.946429,0.91532,0.966667,1.0,0.943323,0.739982,0.91862


#### Maximum marginal relevance retrieval

In [27]:
retriever_mmr = db_k.as_retriever(search_type="mmr",search_kwargs={"k": 3})
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.65it/s]


The average score is: 0.8378031577041688
This is the new best value!
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR           1.0          0.723678           0.916667             0.8   

   Answer Similarity  Answer Correctness   Average  
0            0.92837            0.658104  0.837803  


In [30]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.743175,,,0.915695,0.487775,0.701108
1,GPT-4,0.971429,0.723279,,,0.902019,0.484846,0.690262
2,Naive,0.686914,0.964286,0.874123,0.905556,0.930638,0.909474,0.878499
3,Recursive,0.669849,0.964286,0.885355,0.905556,0.930638,0.909474,0.877526
4,"Chunk 500, overlap 0%",0.953704,0.806519,0.805556,0.9,0.93744,0.683758,0.847829
5,"Chunk 500, overlap 5%",0.944444,0.933492,0.891667,0.9,0.947441,0.742642,0.893281
6,"Chunk 500, overlap 10%",0.705793,0.974965,0.80458,0.872222,0.875755,0.881225,0.852423
7,"Chunk 500, overlap 15%",0.752637,0.907425,0.939409,0.85,0.98129,0.825899,0.87611
8,"Chunk 500, overlap 20%",0.984127,0.931073,0.8,0.875,0.938601,0.722899,0.875283
9,"Chunk 1000, overlap 0%",1.0,0.811036,0.858333,0.8675,0.927681,0.667863,0.855402


#### BM25

In [32]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)

In [33]:
from langchain.retrievers import BM25Retriever
retriever_bm25 = BM25Retriever.from_documents(chunks_2000, search_kwargs={"k": 3})
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.92it/s]


The average score is: 0.8703536299948803
This is the new best value!
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25          0.98          0.899716           0.708333             1.0   

   Answer Similarity  Answer Correctness   Average  
0           0.946193             0.68788  0.870354  


#### Ensambler - Hybrid

In [35]:
ret = db_k.as_retriever(search_kwargs={"k": 3})

In [36]:
from langchain.retrievers import EnsembleRetriever
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:02<01:59,  2.02s/it]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating:  67%|██████▋   | 40/60 [00:05<00:01, 10.77it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating:  93%|█████████▎| 56/60 [00:07<00:00,  6.80it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.48it/s]


The average score is: 0.8648139282427217
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1      0.691669          0.979557           0.798251   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.822619           0.991231            0.905557  0.864814  


In [37]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  97%|█████████▋| 58/60 [00:08<00:00,  9.34it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.22it/s]


The average score is: 0.8733804689425805
This is the new best value!
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2      0.987654          0.820264           0.800516   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0             1.0           0.926481            0.705368  0.87338  


In [38]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  95%|█████████▌| 57/60 [00:08<00:00,  7.77it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 60/60 [00:24<00:00,  2.42it/s]


The average score is: 0.8600716511840957
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3           0.9          0.831635           0.843738   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.930265            0.654792  0.860072  


#### Multi-stage - reranker

In [None]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [None]:
retriever_context = retriever_mmr
compressor = CohereRerank(top_n=6)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

#### creating context by remaking the query

In [51]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
25,"Chunk size 2000, overlap 10%, K=3",0.946429,0.91532,0.966667,1.0,0.943323,0.739982,0.91862


In [53]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [54]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = ret.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:07,  6.88it/s]Invalid JSON response. Expected dictionary with key 'question'
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.69it/s]


{'faithfulness': 0.7441, 'answer_relevancy': 0.9269, 'context_precision': 0.8953, 'context_recall': 0.9333, 'answer_similarity': 0.8644, 'answer_correctness': 0.8067}

In [55]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.8617799007305408


In [56]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.743175,,,0.915695,0.487775,0.701108
1,GPT-4,0.971429,0.723279,,,0.902019,0.484846,0.690262
2,Naive,0.686914,0.964286,0.874123,0.905556,0.930638,0.909474,0.878499
3,Recursive,0.669849,0.964286,0.885355,0.905556,0.930638,0.909474,0.877526
4,"Chunk 500, overlap 0%",0.953704,0.806519,0.805556,0.9,0.93744,0.683758,0.847829
5,"Chunk 500, overlap 5%",0.944444,0.933492,0.891667,0.9,0.947441,0.742642,0.893281
6,"Chunk 500, overlap 10%",0.705793,0.974965,0.80458,0.872222,0.875755,0.881225,0.852423
7,"Chunk 500, overlap 15%",0.752637,0.907425,0.939409,0.85,0.98129,0.825899,0.87611
8,"Chunk 500, overlap 20%",0.984127,0.931073,0.8,0.875,0.938601,0.722899,0.875283
9,"Chunk 1000, overlap 0%",1.0,0.811036,0.858333,0.8675,0.927681,0.667863,0.855402


### change model to GPT-4

In [53]:
result_k3_2000_10_gpt4, results_df = run_and_evaluate(f"Chunk size 2000, overlap 10%, K=3, GPT-4", ret, prompt, llm_gpt4, results_df)
result_k3_2000_10_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.34it/s]


The average score is: 0.9182783048436894


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",0.944444,0.898766,0.983333,1.0,0.946864,0.736261,0.918278


In [56]:
replacement_map = {
    'Chunk 1000, overlap 0%, GPT-4': 'Chunk size 2000, overlap 10%, K=3, GPT-4'
}
results_df['System'] = results_df['System'].replace(replacement_map)

In [54]:
ret_5 = db_k.as_retriever(search_kwargs={"k": 5})

In [55]:
result_k5_2000_10_gpt4, results_df = run_and_evaluate(f"Chunk size 2000, overlap 10%, K=5, GPT-4", ret_5, prompt, llm_gpt4, results_df)
result_k5_2000_10_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.19it/s]


The average score is: 0.8969585598993436


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk size 2000, overlap 10%, K=5, GPT-4",1.0,0.823842,0.88625,1.0,0.949949,0.72171,0.896959


In [57]:
ret_4 = db_k.as_retriever()

In [58]:
result_2000_10_gpt4, results_df = run_and_evaluate(f"Chunk 2000, overlap 10%, GPT-4", ret_4, prompt, llm_gpt4, results_df)
result_2000_10_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:05<00:23,  2.05it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.91it/s]


The average score is: 0.8920871684905182


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 2000, overlap 10%, GPT-4",0.695043,0.953704,0.872279,0.938889,0.956041,0.936567,0.892087


In [58]:
results_df.to_csv(f"../papers/results/results_summarize.csv")